Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
joecorall committed Apr 3, 2024
1 parent b2bbf5d commit 873e19a
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 199 deletions.
64 changes: 9 additions & 55 deletions cmd/doi.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,15 @@ package cmd

import (
"bufio"
"crypto/md5"
"encoding/csv"
"encoding/hex"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/lehigh-university-libraries/papercut/internal/utils"
"github.com/lehigh-university-libraries/papercut/pkg/doi"
"github.com/lehigh-university-libraries/papercut/pkg/romeo"
"github.com/spf13/cobra"
)

Expand Down Expand Up @@ -64,19 +60,10 @@ var (
}
for scanner.Scan() {
var doiObject doi.Article
line := strings.TrimSpace(scanner.Text())
dirPath := filepath.Join("dois", line)
dirPath, err = utils.MkTmpDir(dirPath)
doiStr := strings.TrimSpace(scanner.Text())
doiObject, err := doi.GetDoi(doiStr, url)
if err != nil {
log.Printf("Unable to create cached file directory: %v", err)
continue
}

d := filepath.Join(dirPath, "doi.json")
result := getResult(d, url, line, "application/json")
err = json.Unmarshal(result, &doiObject)
if err != nil {
log.Printf("Could not unmarshal JSON for %s: %v", line, err)
log.Println(err)
continue
}

Expand All @@ -93,6 +80,9 @@ var (
fieldRights := ""
for _, i := range doiObject.ISSN {
identifiers = append(identifiers, fmt.Sprintf(`{"attr0":"issn","value":"%s"}`, i))
if fieldRights == "" {
fieldRights = romeo.FindIssnLicense(i)
}
}

partDetail := []string{}
Expand All @@ -112,53 +102,17 @@ var (
extent = fmt.Sprintf(`{"attr0": "page", "number": "%s"}`, doiObject.Page)
}

pdfUrl := ""
pdf := ""
for _, l := range doiObject.Link {
if l.ContentType == "application/pdf" || strings.Contains(strings.ToLower(l.URL), "pdf") {
pdfUrl = l.URL
}
}
if downloadPdfs {
if pdfUrl == "" {
d = filepath.Join(dirPath, "doi.html")
result = getResult(d, url, line, "text/html")
pattern := `<meta name="citation_pdf_url" content="([^"]+)".*>`
re := regexp.MustCompile(pattern)
matches := re.FindAllSubmatch(result, -1)
var pdfURLs []string
for _, match := range matches {
if len(match) >= 2 {
pdfURLs = append(pdfURLs, string(match[1]))
}
}
for _, url := range pdfURLs {
pdfUrl = url
break
}
}
if pdfUrl != "" {
hash := md5.Sum([]byte(line))
hashStr := hex.EncodeToString(hash[:])

pdf = fmt.Sprintf("papers/dois/%s.pdf", hashStr)
err = utils.DownloadPdf(pdfUrl, pdf)
if err != nil {
err = os.Remove(pdf)
if err != nil {
log.Println("Error deleting file:", err)
}
pdf = pdfUrl
}
}
pdf = doiObject.DownloadPdf()
}

fullTitle := ""
if len(doiObject.Title) > 255 {
fullTitle = doiObject.Title
}
err = wr.Write([]string{
line,
doiStr,
doi.JoinDate(doiObject.Issued),
utils.TrimToMaxLen(doiObject.Title, 255),
fullTitle,
Expand Down
64 changes: 5 additions & 59 deletions cmd/license.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@ package cmd
import (
"bufio"
"encoding/csv"
"encoding/json"
"fmt"
"log"
neturl "net/url"
"os"
"path/filepath"
"strings"

"github.com/lehigh-university-libraries/papercut/internal/utils"
"github.com/lehigh-university-libraries/papercut/pkg/doi"
"github.com/lehigh-university-libraries/papercut/pkg/romeo"
"github.com/spf13/cobra"
Expand All @@ -20,7 +16,6 @@ import (
var (
// used for flags.
licenseFilePath string
romeoApiKey = os.Getenv("SHERPA_ROMEO_API_KEY")
licenseCmd = &cobra.Command{
Use: "license",
Short: "Get license for a DOI",
Expand Down Expand Up @@ -49,72 +44,23 @@ var (
log.Fatalf("Unable to write to CSV: %v", err)
}
for scanner.Scan() {
var doiObject doi.Article
line := strings.TrimSpace(scanner.Text())
dirPath := filepath.Join("dois", line)
dirPath, err = utils.MkTmpDir(dirPath)
doiStr := strings.TrimSpace(scanner.Text())
doiObject, err := doi.GetDoi(doiStr, url)
if err != nil {
log.Printf("Unable to create cached file directory: %v", err)
continue
}

d := filepath.Join(dirPath, "doi.json")
result := getResult(d, url, line, "application/json")
err = json.Unmarshal(result, &doiObject)
if err != nil {
log.Printf("Could not unmarshal JSON for %s: %v", line, err)
log.Println(err)
continue
}

fieldRights := ""
for _, i := range doiObject.ISSN {
d, err = utils.MkTmpDir("issns")
if err != nil {
continue
}
d = filepath.Join(d, i)
publicationId := checkCachedFile(d)
id := string(publicationId)
if publicationId == nil {
id = romeo.GetIdFromIssn(i)
if id != "" {
writeCachedFile(d, id)
}
}
if id == "" {
log.Println("Could not find publication ID for ISSN", i)
continue
}
filter := fmt.Sprintf("[[\"id\",\"equals\",\"%s\"]]", id)
romeUrl := fmt.Sprintf("https://v2.sherpa.ac.uk/cgi/retrieve?item-type=publication&format=Json&limit=10&offset=0&order=-id&filter=%s&api-key=%s", neturl.QueryEscape(filter), romeoApiKey)
d, _ = utils.MkTmpDir(filepath.Join("issns", "ids"))
d = filepath.Join(d, id)
publication := checkCachedFile(d)
if publication == nil {
publication = romeo.GetPublication(romeUrl)
if publication != nil {
writeCachedFile(d, string(publication))
}
}
if publication == nil {
log.Println("Could not find publication info for", i)
continue
}
var r romeo.Response
err = json.Unmarshal(publication, &r)
if err != nil {
log.Printf("Unable to read publication: %v", err)
continue
}

fieldRights = r.GetLicense()
fieldRights = romeo.FindIssnLicense(i)
if fieldRights != "" {
break
}
}

err = wr.Write([]string{
line,
doiStr,
fieldRights,
})
if err != nil {
Expand Down
54 changes: 0 additions & 54 deletions cmd/root.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
package cmd

import (
"encoding/json"
"fmt"
"log"
"os"

"github.com/lehigh-university-libraries/papercut/pkg/doi"
"github.com/spf13/cobra"
)

Expand All @@ -32,54 +29,3 @@ func SetVersionInfo(version, commit, date string) {
func init() {
rootCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle")
}

// getResult returns the response payload for a DOI request, preferring
// a cached copy at path d and otherwise fetching url/line and caching
// the result. A fetch failure is fatal to the process.
func getResult(d, url, line, acceptContentType string) []byte {
	var err error
	content := checkCachedFile(d)
	if content != nil {
		var a doi.Affiliation
		err = json.Unmarshal(content, &a)
		// Cached JSON is only trusted if it still parses into
		// doi.Affiliation (presumably a cheap validity probe — TODO
		// confirm); HTML caches are returned as-is since they were
		// never JSON to begin with.
		if err == nil || acceptContentType == "text/html" {
			return content
		}
		log.Println("Error unmarshalling cached file:", err)
	}

	// No usable cache: fetch from the API and cache the raw bytes.
	apiURL := fmt.Sprintf("%s/%s", url, line)

	log.Printf("Accessing %s\n", apiURL)

	doiObject, err := doi.GetObject(apiURL, acceptContentType)
	if err != nil {
		log.Fatal(err)
	}
	writeCachedFile(d, string(doiObject))
	return doiObject
}

// checkCachedFile returns the contents of the cached file at d, or nil
// if the file does not exist or cannot be read.
func checkCachedFile(d string) []byte {
	// Read directly instead of Stat-then-Read: avoids a TOCTOU race
	// between the two syscalls and saves one of them.
	content, err := os.ReadFile(d)
	if err != nil {
		// A missing cache file is the normal miss case, not an error.
		if !os.IsNotExist(err) {
			log.Println("Error reading cached file:", err)
		}
		return nil
	}
	return content
}

// writeCachedFile writes c to the file at path f, creating or
// truncating it. Errors are logged rather than returned so callers can
// treat caching as best-effort.
func writeCachedFile(f, c string) {
	// os.WriteFile replaces the Create/WriteString/Close dance and
	// guarantees the handle is closed even on a partial write. 0o666
	// matches os.Create's mode (before umask). Errors now go through
	// log consistently instead of a mix of fmt.Println and log.Println.
	if err := os.WriteFile(f, []byte(c), 0o666); err != nil {
		log.Println("Error caching DOI JSON:", err)
	}
}
82 changes: 82 additions & 0 deletions internal/utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,85 @@ func StrInSlice(s string, sl []string) bool {
}
return false
}

// GetResult returns the payload cached at path d; on a cache miss it
// fetches url with the given Accept header, caches the response body,
// and returns it. A fetch failure is fatal to the process.
func GetResult(d, url, acceptContentType string) []byte {
	// Fast path: serve straight from the on-disk cache.
	if cached := CheckCachedFile(d); cached != nil {
		return cached
	}

	log.Printf("Accessing %s\n", url)

	body, err := getResult(url, acceptContentType)
	if err != nil {
		log.Fatal(err)
	}

	// Persist for the next run before handing the bytes back.
	WriteCachedFile(d, string(body))

	return body
}

// CheckCachedFile returns the contents of the cached file at d, or nil
// if the file does not exist or cannot be read.
func CheckCachedFile(d string) []byte {
	// Read directly instead of Stat-then-Read: avoids a TOCTOU race
	// between the two syscalls and saves one of them.
	content, err := os.ReadFile(d)
	if err != nil {
		// A missing cache file is the normal miss case, not an error.
		if !os.IsNotExist(err) {
			log.Println("Error reading cached file:", err)
		}
		return nil
	}
	return content
}

// WriteCachedFile writes c to the file at path f, creating or
// truncating it. Errors are logged rather than returned so callers can
// treat caching as best-effort.
func WriteCachedFile(f, c string) {
	// os.WriteFile replaces the Create/WriteString/Close dance and
	// guarantees the handle is closed even on a partial write. 0o666
	// matches os.Create's mode (before umask). Errors now go through
	// log consistently instead of a mix of fmt.Println and log.Println.
	if err := os.WriteFile(f, []byte(c), 0o666); err != nil {
		log.Println("Error caching DOI JSON:", err)
	}
}

// getResult performs a GET request against url with the given Accept
// header and returns the response body, or an error for any transport
// failure or non-2xx status.
func getResult(url, acceptContentType string) ([]byte, error) {
	// NOTE(review): this client has no Timeout, so a hung server can
	// stall the caller indefinitely — consider http.Client{Timeout: ...}.
	client := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyFromEnvironment,
		},
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		// Wrap and return once; the previous fmt.Println here meant the
		// same error was both printed and returned (handled twice).
		return nil, fmt.Errorf("creating request for %s: %w", url, err)
	}

	// Browser-like headers: some publisher sites reject requests that
	// look like a default Go HTTP client.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	req.Header.Set("Accept", acceptContentType)
	req.Header.Set("Accept-Language", "en-US")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cache-Control", "no-cache")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// The check accepts any 2xx, so say "non-2xx" (the old message
	// claimed "non-200", which contradicted the condition).
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("%s returned a non-2xx status code: %d", url, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	return body, nil
}
Loading

0 comments on commit 873e19a

Please sign in to comment.