Add utility to fetch DOI metadata + papers for workbench ingest (#4)
joecorall authored Mar 28, 2024
1 parent f38f23e commit bbea999
Showing 7 changed files with 759 additions and 7 deletions.
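The new subcommand reads one DOI per line from a file, fetches metadata (and, where it can, the PDF) for each, and writes a CSV to stdout whose field_* columns follow what appear to be Islandora Workbench conventions, given the commit message and the column names. A typical invocation might look like this; the papercut binary name is inferred from the module path, and the file names are illustrative:

    # hypothetical usage: dois.txt holds one DOI per line
    papercut get doi --file dois.txt > ingest.csv

    # or point at a different DOI resolver than the default
    papercut get doi -f dois.txt -u https://doi.org > ingest.csv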
236 changes: 236 additions & 0 deletions cmd/doi.go
@@ -0,0 +1,236 @@
package cmd

import (
    "bufio"
    "crypto/md5"
    "encoding/csv"
    "encoding/hex"
    "encoding/json"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "regexp"
    "strings"

    "github.com/lehigh-university-libraries/papercut/internal/utils"
    "github.com/lehigh-university-libraries/papercut/pkg/doi"
    "github.com/spf13/cobra"
)

var (
    // used for flags.
    filePath string

    doiCmd = &cobra.Command{
        Use:   "doi",
        Short: "Get DOI metadata and PDF",
        Run: func(cmd *cobra.Command, args []string) {
            file, err := os.Open(filePath)
            if err != nil {
                fmt.Println("Error opening file:", err)
                return
            }
            defer file.Close()

            // Create a scanner to read the file line by line
            scanner := bufio.NewScanner(file)
            url, err := cmd.Flags().GetString("url")
            if err != nil {
                log.Fatal(err)
            }
            wr := csv.NewWriter(os.Stdout)

            // CSV header
            err = wr.Write([]string{
                "id",
                "field_edtf_date_issued",
                "title",
                "field_full_title",
                "field_abstract",
                "field_model",
                "field_linked_agent",
                "field_identifier",
                "field_part_detail",
                "field_related_item",
                "field_extent",
                "field_language",
                "field_rights",
                "field_subject",
                "file",
            })
            if err != nil {
                log.Fatalf("Unable to write to CSV: %v", err)
            }
            for scanner.Scan() {
                var doiObject doi.Article
                line := strings.TrimSpace(scanner.Text())
                dirPath := filepath.Join("dois", line)
                dirPath, err = utils.MkTmpDir(dirPath)
                if err != nil {
                    log.Printf("Unable to create cached file directory: %v", err)
                    continue
                }

                d := filepath.Join(dirPath, "doi.json")
                result := getResult(d, url, line, "application/json")
                err = json.Unmarshal(result, &doiObject)
                if err != nil {
                    log.Printf("Could not unmarshal JSON for %s: %v", line, err)
                    continue
                }

                var linkedAgent []string
                for _, author := range doiObject.Authors {
                    linkedAgent = append(linkedAgent, fmt.Sprintf("relators:aut:person:%s, %s", author.Family, author.Given))
                }
                if doiObject.Publisher != "" {
                    linkedAgent = append(linkedAgent, fmt.Sprintf("relators:pbl:corporate_body:%s", doiObject.Publisher))
                }
                identifiers := []string{
                    fmt.Sprintf(`{"attr0":"doi","value":"%s"}`, doiObject.DOI),
                }
                for _, i := range doiObject.ISSN {
                    identifiers = append(identifiers, fmt.Sprintf(`{"attr0":"issn","value":"%s"}`, i))
                }

                partDetail := []string{}
                if doiObject.Volume != "" {
                    partDetail = append(partDetail, fmt.Sprintf(`{"type": "volume", "number": "%s"}`, doiObject.Volume))
                }
                if doiObject.Issue != "" {
                    partDetail = append(partDetail, fmt.Sprintf(`{"type": "issue", "number": "%s"}`, doiObject.Issue))
                }

                relatedItem := []string{}
                if doiObject.ContainerTitle != "" {
                    relatedItem = append(relatedItem, fmt.Sprintf(`{"title": "%s"}`, doiObject.ContainerTitle))
                }
                extent := ""
                if doiObject.Page != "" {
                    extent = fmt.Sprintf(`{"attr0": "page", "number": "%s"}`, doiObject.Page)
                }

pdfUrl := ""
pdf := ""
for _, l := range doiObject.Link {
if l.ContentType == "application/pdf" || strings.Contains(strings.ToLower(l.URL), "pdf") {
pdfUrl = l.URL

}
}
if pdfUrl == "" {
d = filepath.Join(dirPath, "doi.html")
result = getResult(d, url, line, "text/html")
pattern := `<meta name="citation_pdf_url" content="([^"]+)".*>`
re := regexp.MustCompile(pattern)
matches := re.FindAllSubmatch(result, -1)
var pdfURLs []string
for _, match := range matches {
if len(match) >= 2 {
log.Println(string(match[1]))
pdfURLs = append(pdfURLs, string(match[1]))
}
}
for _, url := range pdfURLs {
pdfUrl = url
break
}
}
if pdfUrl != "" {
hash := md5.Sum([]byte(line))
hashStr := hex.EncodeToString(hash[:])

pdf = fmt.Sprintf("papers/dois/%s.pdf", hashStr)
err = utils.DownloadPdf(pdfUrl, pdf)
if err != nil {
err = os.Remove(pdf)
if err != nil {
log.Println("Error deleting file:", err)
}
pdf = pdfUrl
}
}

fullTitle := ""
if len(doiObject.Title) > 255 {
fullTitle = doiObject.Title
}
err = wr.Write([]string{
line,
doi.JoinDate(doiObject.Issued),
utils.TrimToMaxLen(doiObject.Title, 255),
fullTitle,
doiObject.Abstract,
"Digital Document",
strings.Join(linkedAgent, "|"),
strings.Join(identifiers, "|"),
strings.Join(partDetail, "|"),
strings.Join(relatedItem, "|"),
extent,
doiObject.Language,
"",
strings.Join(doiObject.Subject, "|"),
pdf,
})
if err != nil {
log.Fatalf("Unable to write to CSV: %v", err)
}
wr.Flush()
}

if err := scanner.Err(); err != nil {
fmt.Println("Error scanning file:", err)
return
}
},
}
)

func init() {
    getCmd.AddCommand(doiCmd)

    doiCmd.Flags().StringP("url", "u", "https://dx.doi.org", "The DOI API url")
    doiCmd.Flags().StringVarP(&filePath, "file", "f", "", "path to file containing one DOI per line")
}

func getResult(d, url, line, acceptContentType string) []byte {
    var err error

    // see if we can just get the cached file
    if _, err := os.Stat(d); err == nil {
        content, err := os.ReadFile(d)
        if err != nil {
            fmt.Println("Error reading cached file:", err)
        } else {
            var a doi.Affiliation
            err = json.Unmarshal(content, &a)
            if err == nil || acceptContentType == "text/html" {
                return content
            }
            log.Println("Error unmarshalling cached file:", err)
        }
    }

    apiURL := fmt.Sprintf("%s/%s", url, line)

    log.Printf("Accessing %s\n", apiURL)

    doiObject, err := doi.GetObject(apiURL, acceptContentType)
    if err != nil {
        log.Fatal(err)
    }
    cacheFile, err := os.Create(d)
    if err != nil {
        fmt.Println("Error creating file:", err)
        return nil
    }
    defer cacheFile.Close()

    _, err = cacheFile.Write(doiObject)
    if err != nil {
        fmt.Println("Error caching DOI JSON:", err)
    }

    return doiObject
}
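
The pkg/doi package imported above is among the changed files whose diff is not rendered on this page. Judging from how the command consumes it, doi.GetObject is used as func GetObject(url, acceptContentType string) ([]byte, error), and the Article type presumably resembles the sketch below. The field names come straight from their usage in cmd/doi.go; the JSON tags and the Date shape are assumptions modeled on Crossref's content-negotiation response, not the actual code:

package doi

// Sketch only: these shapes are inferred from how cmd/doi.go uses them and
// from Crossref's content-negotiation JSON; the actual definitions live in
// the unrendered pkg/doi diff and may differ.

type Author struct {
    Given  string `json:"given"`
    Family string `json:"family"`
}

type Link struct {
    URL         string `json:"URL"`
    ContentType string `json:"content-type"`
}

// Issued is passed to doi.JoinDate; Crossref encodes dates as nested
// "date-parts" arrays, so something like this is assumed.
type Date struct {
    DateParts [][]int `json:"date-parts"`
}

type Article struct {
    DOI            string   `json:"DOI"`
    Title          string   `json:"title"`
    Abstract       string   `json:"abstract"`
    Authors        []Author `json:"author"`
    Publisher      string   `json:"publisher"`
    ISSN           []string `json:"ISSN"`
    Volume         string   `json:"volume"`
    Issue          string   `json:"issue"`
    ContainerTitle string   `json:"container-title"`
    Page           string   `json:"page"`
    Language       string   `json:"language"`
    Subject        []string `json:"subject"`
    Issued         Date     `json:"issued"`
    Link           []Link   `json:"link"`
}

One wrinkle: Crossref returns title and container-title as arrays, while the command reads them as plain strings, so the real package presumably flattens those fields during unmarshaling.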
18 changes: 18 additions & 0 deletions cmd/get.go
@@ -0,0 +1,18 @@
package cmd

import (
    "github.com/spf13/cobra"
)

// getCmd represents the get command
var getCmd = &cobra.Command{
    Use:   "get",
    Short: "Get articles.",
    Long: `Fetch PDFs and/or metadata for articles.
A subcommand is required in order to fetch the article from a specific source.`,
}

func init() {
    rootCmd.AddCommand(getCmd)
}
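
cmd/get.go only registers a parent command; the DOI logic hangs off of it, so the command tree after this commit presumably looks like the following (the papercut binary name is again inferred from the module path, and rootCmd itself is defined elsewhere in the repository):

papercut
└── get          (getCmd, added here)
    └── doi      (doiCmd, from cmd/doi.go above)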
3 changes: 2 additions & 1 deletion go.mod
@@ -2,8 +2,9 @@ module github.com/lehigh-university-libraries/papercut
 
 go 1.20
 
+require github.com/spf13/cobra v1.7.0
+
 require (
     github.com/inconshreveable/mousetrap v1.1.0 // indirect
-    github.com/spf13/cobra v1.7.0 // indirect
     github.com/spf13/pflag v1.0.5 // indirect
 )
86 changes: 80 additions & 6 deletions internal/utils/helpers.go
@@ -3,8 +3,12 @@ package utils

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "path/filepath"
    "regexp"
    "time"
    "unicode/utf8"
)

@@ -33,16 +37,86 @@ func FetchEmails(url string) ([]string, error) {
}

func TrimToMaxLen(s string, maxLen int) string {
    // Truncate to maxLen runes (not bytes) so multi-byte characters survive.
    if utf8.RuneCountInString(s) > maxLen {
        runes := []rune(s)
        return string(runes[:maxLen])
    }

    return s
}

func MkTmpDir(d string) (string, error) {
    dirPath := filepath.Join(os.TempDir(), d)
    if _, err := os.Stat(dirPath); err == nil {
        return dirPath, nil
    }

    if err := os.MkdirAll(dirPath, 0755); err != nil && !os.IsExist(err) {
        return "", err
    }

    return dirPath, nil
}

func DownloadPdf(url, filePath string) error {
    downloadDirectory := filepath.Dir(filePath)
    if err := os.MkdirAll(downloadDirectory, 0755); err != nil {
        fmt.Println("Error creating directory:", err)
        return err
    }

    if _, err := os.Stat(filePath); os.IsNotExist(err) {
        file, err := os.Create(filePath)
        if err != nil {
            fmt.Println("Error creating file:", err)
            return err
        }
        defer file.Close()

        client := &http.Client{
            Transport: &http.Transport{
                Proxy: http.ProxyFromEnvironment,
            },
        }

        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            log.Println("Error creating request:", err)
            return err
        }

        // Some publishers refuse requests that lack browser-like headers.
        req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
        req.Header.Set("Accept", "application/pdf")
        req.Header.Set("Accept-Language", "en-US")
        req.Header.Set("Connection", "keep-alive")
        req.Header.Set("Cache-Control", "no-cache")

        response, err := client.Do(req)
        if err != nil {
            log.Println("Error downloading PDF:", err)
            return err
        }
        defer response.Body.Close()

        if response.StatusCode > 299 {
            log.Printf("Error: HTTP status %d\n", response.StatusCode)
            return fmt.Errorf("%s returned HTTP status %d", url, response.StatusCode)
        }

        _, err = io.Copy(file, response.Body)
        if err != nil {
            log.Println("Error copying PDF content to file:", err)
            return err
        }
    }

    // Pause briefly between downloads to avoid hammering publisher servers.
    time.Sleep(500 * time.Millisecond)

    return nil
}
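
As a quick illustration of the two new helpers, a caller might exercise them like this. This is a minimal sketch with arbitrary values, and it would have to live inside this repository, since internal/ packages are not importable from other modules:

package main

import (
    "fmt"
    "log"

    "github.com/lehigh-university-libraries/papercut/internal/utils"
)

func main() {
    // Truncation counts runes, not bytes, so multi-byte characters are kept whole.
    fmt.Println(utils.TrimToMaxLen("héllo wörld", 5)) // héllo

    // Creates (or reuses) a directory beneath os.TempDir().
    dir, err := utils.MkTmpDir("dois/10.1000-example")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(dir) // e.g. /tmp/dois/10.1000-example
}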
(Diffs for the remaining three changed files, presumably including the pkg/doi package, are not rendered on this page.)
