Add utility to fetch DOI metadata + papers for workbench ingest (#4)
joecorall authored Mar 28, 2024
1 parent f38f23e commit bbea999
Showing 7 changed files with 759 additions and 7 deletions.
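The new subcommand reads one DOI per line from a file, fetches metadata (and, where it can, the PDF) for each, and writes a CSV to stdout whose field_* columns follow what appear to be Islandora Workbench conventions, given the commit message and the column names. A typical invocation might look like this; the papercut binary name is inferred from the module path, and the file names are illustrative:

    # hypothetical usage: dois.txt holds one DOI per line
    papercut get doi --file dois.txt > ingest.csv

    # or point at a different DOI resolver than the default
    papercut get doi -f dois.txt -u https://doi.org > ingest.csv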
236 changes: 236 additions & 0 deletions cmd/doi.go
@@ -0,0 +1,236 @@
package cmd

import (
    "bufio"
    "crypto/md5"
    "encoding/csv"
    "encoding/hex"
    "encoding/json"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "regexp"
    "strings"

    "github.com/lehigh-university-libraries/papercut/internal/utils"
    "github.com/lehigh-university-libraries/papercut/pkg/doi"
    "github.com/spf13/cobra"
)

var (
    // used for flags.
    filePath string

    doiCmd = &cobra.Command{
        Use:   "doi",
        Short: "Get DOI metadata and PDF",
        Run: func(cmd *cobra.Command, args []string) {
            file, err := os.Open(filePath)
            if err != nil {
                fmt.Println("Error opening file:", err)
                return
            }
            defer file.Close()

            // Create a scanner to read the file line by line
            scanner := bufio.NewScanner(file)
            url, err := cmd.Flags().GetString("url")
            if err != nil {
                log.Fatal(err)
            }
            wr := csv.NewWriter(os.Stdout)

            // CSV header
            err = wr.Write([]string{
                "id",
                "field_edtf_date_issued",
                "title",
                "field_full_title",
                "field_abstract",
                "field_model",
                "field_linked_agent",
                "field_identifier",
                "field_part_detail",
                "field_related_item",
                "field_extent",
                "field_language",
                "field_rights",
                "field_subject",
                "file",
            })
            if err != nil {
                log.Fatalf("Unable to write to CSV: %v", err)
            }
            for scanner.Scan() {
                var doiObject doi.Article
                line := strings.TrimSpace(scanner.Text())
                dirPath := filepath.Join("dois", line)
                dirPath, err = utils.MkTmpDir(dirPath)
                if err != nil {
                    log.Printf("Unable to create cached file directory: %v", err)
                    continue
                }

                d := filepath.Join(dirPath, "doi.json")
                result := getResult(d, url, line, "application/json")
                err = json.Unmarshal(result, &doiObject)
                if err != nil {
                    log.Printf("Could not unmarshal JSON for %s: %v", line, err)
                    continue
                }

                var linkedAgent []string
                for _, author := range doiObject.Authors {
                    linkedAgent = append(linkedAgent, fmt.Sprintf("relators:aut:person:%s, %s", author.Family, author.Given))
                }
                if doiObject.Publisher != "" {
                    linkedAgent = append(linkedAgent, fmt.Sprintf("relators:pbl:corporate_body:%s", doiObject.Publisher))
                }
                identifiers := []string{
                    fmt.Sprintf(`{"attr0":"doi","value":"%s"}`, doiObject.DOI),
                }
                for _, i := range doiObject.ISSN {
                    identifiers = append(identifiers, fmt.Sprintf(`{"attr0":"issn","value":"%s"}`, i))
                }

                partDetail := []string{}
                if doiObject.Volume != "" {
                    partDetail = append(partDetail, fmt.Sprintf(`{"type": "volume", "number": "%s"}`, doiObject.Volume))
                }
                if doiObject.Issue != "" {
                    partDetail = append(partDetail, fmt.Sprintf(`{"type": "issue", "number": "%s"}`, doiObject.Issue))
                }

                relatedItem := []string{}
                if doiObject.ContainerTitle != "" {
                    relatedItem = append(relatedItem, fmt.Sprintf(`{"title": "%s"}`, doiObject.ContainerTitle))
                }
                extent := ""
                if doiObject.Page != "" {
                    extent = fmt.Sprintf(`{"attr0": "page", "number": "%s"}`, doiObject.Page)
                }

pdfUrl := ""
pdf := ""
for _, l := range doiObject.Link {
if l.ContentType == "application/pdf" || strings.Contains(strings.ToLower(l.URL), "pdf") {
pdfUrl = l.URL

}
}
if pdfUrl == "" {
d = filepath.Join(dirPath, "doi.html")
result = getResult(d, url, line, "text/html")
pattern := `<meta name="citation_pdf_url" content="([^"]+)".*>`
re := regexp.MustCompile(pattern)
matches := re.FindAllSubmatch(result, -1)
var pdfURLs []string
for _, match := range matches {
if len(match) >= 2 {
log.Println(string(match[1]))
pdfURLs = append(pdfURLs, string(match[1]))
}
}
for _, url := range pdfURLs {
pdfUrl = url
break
}
}
if pdfUrl != "" {
hash := md5.Sum([]byte(line))
hashStr := hex.EncodeToString(hash[:])

pdf = fmt.Sprintf("papers/dois/%s.pdf", hashStr)
err = utils.DownloadPdf(pdfUrl, pdf)
if err != nil {
err = os.Remove(pdf)
if err != nil {
log.Println("Error deleting file:", err)
}
pdf = pdfUrl
}
}

fullTitle := ""
if len(doiObject.Title) > 255 {
fullTitle = doiObject.Title
}
err = wr.Write([]string{
line,
doi.JoinDate(doiObject.Issued),
utils.TrimToMaxLen(doiObject.Title, 255),
fullTitle,
doiObject.Abstract,
"Digital Document",
strings.Join(linkedAgent, "|"),
strings.Join(identifiers, "|"),
strings.Join(partDetail, "|"),
strings.Join(relatedItem, "|"),
extent,
doiObject.Language,
"",
strings.Join(doiObject.Subject, "|"),
pdf,
})
if err != nil {
log.Fatalf("Unable to write to CSV: %v", err)
}
wr.Flush()
}

if err := scanner.Err(); err != nil {
fmt.Println("Error scanning file:", err)
return
}
},
}
)

func init() {
    getCmd.AddCommand(doiCmd)

    doiCmd.Flags().StringP("url", "u", "https://dx.doi.org", "The DOI API url")
    doiCmd.Flags().StringVarP(&filePath, "file", "f", "", "path to file containing one DOI per line")
}

func getResult(d, url, line, acceptContentType string) []byte {
    var err error

    // see if we can just get the cached file
    if _, err := os.Stat(d); err == nil {
        content, err := os.ReadFile(d)
        if err != nil {
            fmt.Println("Error reading cached file:", err)
        } else {
            var a doi.Affiliation
            err = json.Unmarshal(content, &a)
            if err == nil || acceptContentType == "text/html" {
                return content
            }
            log.Println("Error unmarshalling cached file:", err)
        }
    }

    apiURL := fmt.Sprintf("%s/%s", url, line)

    log.Printf("Accessing %s\n", apiURL)

    doiObject, err := doi.GetObject(apiURL, acceptContentType)
    if err != nil {
        log.Fatal(err)
    }
    cacheFile, err := os.Create(d)
    if err != nil {
        fmt.Println("Error creating file:", err)
        return nil
    }
    defer cacheFile.Close()

    _, err = cacheFile.Write(doiObject)
    if err != nil {
        fmt.Println("Error caching DOI JSON:", err)
    }

    return doiObject
}
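
The pkg/doi package imported above is among the changed files whose diff is not rendered on this page. Judging from how the command consumes it, doi.GetObject is used as func GetObject(url, acceptContentType string) ([]byte, error), and the Article type presumably resembles the sketch below. The field names come straight from their usage in cmd/doi.go; the JSON tags and the Date shape are assumptions modeled on Crossref's content-negotiation response, not the actual code:

package doi

// Sketch only: these shapes are inferred from how cmd/doi.go uses them and
// from Crossref's content-negotiation JSON; the actual definitions live in
// the unrendered pkg/doi diff and may differ.

type Author struct {
    Given  string `json:"given"`
    Family string `json:"family"`
}

type Link struct {
    URL         string `json:"URL"`
    ContentType string `json:"content-type"`
}

// Issued is passed to doi.JoinDate; Crossref encodes dates as nested
// "date-parts" arrays, so something like this is assumed.
type Date struct {
    DateParts [][]int `json:"date-parts"`
}

type Article struct {
    DOI            string   `json:"DOI"`
    Title          string   `json:"title"`
    Abstract       string   `json:"abstract"`
    Authors        []Author `json:"author"`
    Publisher      string   `json:"publisher"`
    ISSN           []string `json:"ISSN"`
    Volume         string   `json:"volume"`
    Issue          string   `json:"issue"`
    ContainerTitle string   `json:"container-title"`
    Page           string   `json:"page"`
    Language       string   `json:"language"`
    Subject        []string `json:"subject"`
    Issued         Date     `json:"issued"`
    Link           []Link   `json:"link"`
}

One wrinkle: Crossref returns title and container-title as arrays, while the command reads them as plain strings, so the real package presumably flattens those fields during unmarshaling.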
18 changes: 18 additions & 0 deletions cmd/get.go
@@ -0,0 +1,18 @@
package cmd

import (
    "github.com/spf13/cobra"
)

// getCmd represents the get command
var getCmd = &cobra.Command{
    Use:   "get",
    Short: "Get articles.",
    Long: `Fetch PDFs and/or metadata for articles.
A subcommand is required in order to fetch the article from a specific source.`,
}

func init() {
    rootCmd.AddCommand(getCmd)
}
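
cmd/get.go only registers a parent command; the DOI logic hangs off of it, so the command tree after this commit presumably looks like the following (the papercut binary name is again inferred from the module path, and rootCmd itself is defined elsewhere in the repository):

papercut
└── get          (getCmd, added here)
    └── doi      (doiCmd, from cmd/doi.go above)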
3 changes: 2 additions & 1 deletion go.mod
@@ -2,8 +2,9 @@ module github.com/lehigh-university-libraries/papercut
 
 go 1.20
 
+require github.com/spf13/cobra v1.7.0
+
 require (
     github.com/inconshreveable/mousetrap v1.1.0 // indirect
-    github.com/spf13/cobra v1.7.0 // indirect
     github.com/spf13/pflag v1.0.5 // indirect
 )
86 changes: 80 additions & 6 deletions internal/utils/helpers.go
@@ -3,8 +3,12 @@ package utils

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "path/filepath"
    "regexp"
    "time"
    "unicode/utf8"
)

@@ -33,16 +37,86 @@ func FetchEmails(url string) ([]string, error) {
}

func TrimToMaxLen(s string, maxLen int) string {
    // Truncate to maxLen runes (not bytes) so multi-byte characters survive.
    if utf8.RuneCountInString(s) > maxLen {
        runes := []rune(s)
        return string(runes[:maxLen])
    }

    return s
}

func MkTmpDir(d string) (string, error) {
    dirPath := filepath.Join(os.TempDir(), d)
    if _, err := os.Stat(dirPath); err == nil {
        return dirPath, nil
    }

    if err := os.MkdirAll(dirPath, 0755); err != nil && !os.IsExist(err) {
        return "", err
    }

    return dirPath, nil
}

func DownloadPdf(url, filePath string) error {
    downloadDirectory := filepath.Dir(filePath)
    if err := os.MkdirAll(downloadDirectory, 0755); err != nil {
        fmt.Println("Error creating directory:", err)
        return err
    }

    if _, err := os.Stat(filePath); os.IsNotExist(err) {
        file, err := os.Create(filePath)
        if err != nil {
            fmt.Println("Error creating file:", err)
            return err
        }
        defer file.Close()

        client := &http.Client{
            Transport: &http.Transport{
                Proxy: http.ProxyFromEnvironment,
            },
        }

        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            log.Println("Error creating request:", err)
            return err
        }

        // Some publishers refuse requests that lack browser-like headers.
        req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
        req.Header.Set("Accept", "application/pdf")
        req.Header.Set("Accept-Language", "en-US")
        req.Header.Set("Connection", "keep-alive")
        req.Header.Set("Cache-Control", "no-cache")

        response, err := client.Do(req)
        if err != nil {
            log.Println("Error downloading PDF:", err)
            return err
        }
        defer response.Body.Close()

        if response.StatusCode > 299 {
            log.Printf("Error: HTTP status %d\n", response.StatusCode)
            return fmt.Errorf("%s returned HTTP status %d", url, response.StatusCode)
        }

        _, err = io.Copy(file, response.Body)
        if err != nil {
            log.Println("Error copying PDF content to file:", err)
            return err
        }
    }

    // Pause briefly between downloads to avoid hammering publisher servers.
    time.Sleep(500 * time.Millisecond)

    return nil
}
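
As a quick illustration of the two new helpers, a caller might exercise them like this. This is a minimal sketch with arbitrary values, and it would have to live inside this repository, since internal/ packages are not importable from other modules:

package main

import (
    "fmt"
    "log"

    "github.com/lehigh-university-libraries/papercut/internal/utils"
)

func main() {
    // Truncation counts runes, not bytes, so multi-byte characters are kept whole.
    fmt.Println(utils.TrimToMaxLen("héllo wörld", 5)) // héllo

    // Creates (or reuses) a directory beneath os.TempDir().
    dir, err := utils.MkTmpDir("dois/10.1000-example")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(dir) // e.g. /tmp/dois/10.1000-example
}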
(Diffs for the remaining three changed files, presumably including the pkg/doi package, are not rendered on this page.)
