Skip to content

Commit

Permalink
fix: make scraper always run; skip already-saved entries unless the -f flag is set
Browse files Browse the repository at this point in the history
  • Loading branch information
lorenzocorallo committed Oct 17, 2024
1 parent 8377f35 commit 3c05e72
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 23 deletions.
37 changes: 17 additions & 20 deletions cmd/scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,64 +32,61 @@ func main() {
}

mansWriter, err := writer.NewWriter[[]scraper.Manifesto](opts.dataDir)
mans, scraped := ParseLocalOrScrapeManifesti(&mansWriter, opts.force)
mans := ScrapeManifestiWithLocal(&mansWriter, opts.force)
if err != nil {
panic(err)
}

if scraped {
slog.Info("scraped manifesti", "found", len(mans))
slog.Info("finished scraping manifesti, writing to file...", "found", len(mans))

err = mansWriter.JsonWrite(constants.OutputManifestiListFilename, mans, false)
if err != nil {
panic(err)
}
} else {
slog.Info("parsed manifesti", "found", len(mans))
err = mansWriter.JsonWrite(constants.OutputManifestiListFilename, mans, false)
if err != nil {
panic(err)
}

slog.Info("successfully written manifesti to file!")

manEquals, err := DoLocalEqualsRemoteManifesti(&mansWriter)
if err != nil {
panic(err)
slog.Error("cannot perform comparison between local and remote versions", "err", err)
return
}

slog.Info("Scrape manifesti, equals to remote version??", "equals", manEquals)
slog.Info("Scrape manifesti, equals to remote version?? SUS", "equals", manEquals)
}

func ParseLocalOrScrapeManifesti(w *writer.Writer[[]scraper.Manifesto], force bool) ([]scraper.Manifesto, bool) {
func ScrapeManifestiWithLocal(w *writer.Writer[[]scraper.Manifesto], force bool) []scraper.Manifesto {
fn := constants.OutputManifestiListFilename
fp := w.GetFilePath(fn)
slog := slog.With("filepath", fp)

if force {
slog.Info("Scraping manifesti because of -f flag")
return scraper.ScrapeManifesti(), true
return scraper.ScrapeManifesti(nil)
}

local, err := w.JsonRead(fn)
if err != nil {
switch {
case errors.Is(err, os.ErrNotExist):
slog.Info(fmt.Sprintf("%s file not found, running scraper...", fn))
return scraper.ScrapeManifesti(), true
slog.Info(fmt.Sprintf("%s file not found, running scraper...", fn))
case errors.As(err, new(*json.SyntaxError)):
slog.Error(fmt.Sprintf("%s contains malformed JSON, running scraper...", fn))
return scraper.ScrapeManifesti(), true
case errors.As(err, new(*json.UnmarshalTypeError)):
slog.Error(fmt.Sprintf("%s contains JSON not compatible with the Manifesto struct, running scraper...", fn))
return scraper.ScrapeManifesti(), true
default:
slog.Error("Failed to read from manifesti json file, running scraper...", "error", err)
return scraper.ScrapeManifesti(), true
}
return scraper.ScrapeManifesti(nil)
}

if len(local) == 0 {
slog.Info(fmt.Sprintf("%s file is empty, running scraper...", fn))
return scraper.ScrapeManifesti(), true
return scraper.ScrapeManifesti(nil)
}

return local, false
slog.Info(fmt.Sprintf("loaded %d manifesti from %s json file, running scraper to check if there are new ones. If you would like to regenerate the whole thing, use the -f flag.", len(local), fn))
return scraper.ScrapeManifesti(local)
}

func GetRemoteManifesti() ([]byte, []scraper.Manifesto, error) {
Expand Down
17 changes: 14 additions & 3 deletions pkg/scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"log/slog"
"net/http"
"reflect"
"slices"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -37,13 +38,18 @@ type Manifesto struct {
DegreeType string `json:"type"`
}

func ScrapeManifesti() []Manifesto {
func ScrapeManifesti(alreadyScraped []Manifesto) []Manifesto {
urls := []string{constants.WebPolimiDesignUrl, constants.WebPolimiArchUrbUrl, constants.WebPolimiIngCivUrl, constants.WebPolimiIngInfIndUrl}
// hrefs := []string{}
out := []Manifesto{}
out := alreadyScraped

wg := sync.WaitGroup{}

alreadyScrapedUrl := make([]string, len(alreadyScraped))
for i, as := range alreadyScraped {
alreadyScrapedUrl[i] = as.Url
}

for _, url := range urls {
wg.Add(1)
go func() {
Expand Down Expand Up @@ -92,7 +98,12 @@ func ScrapeManifesti() []Manifesto {
q.Del("__pj0")
optUrl.RawQuery = q.Encode()

slog.Debug("optgroup", "label", degreeType, "opt", courseName, "value", value, "link", optUrl.String())
if slices.Contains(alreadyScrapedUrl, optUrl.String()) {
slog.Debug("url already scraped, skipping...", "url", optUrl.String())
return
}

slog.Debug("found new manifesti url, scraping...", "url", optUrl.String())
mandoc, _, err := loadDoc(optUrl.String())
if err != nil {
log.Fatal(err)
Expand Down

0 comments on commit 3c05e72

Please sign in to comment.