From 3c05e72b787770c6df5ad216eb24bfd8f0ac6595 Mon Sep 17 00:00:00 2001
From: Lorenzo Corallo
Date: Thu, 17 Oct 2024 17:28:23 +0200
Subject: [PATCH] fix: make scraper always run, ignore already saved unless -f flag

---
 cmd/scraper/main.go    | 37 +++++++++++++++++--------------------
 pkg/scraper/scraper.go | 17 ++++++++++++++---
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/cmd/scraper/main.go b/cmd/scraper/main.go
index e8d99d0..e79d322 100644
--- a/cmd/scraper/main.go
+++ b/cmd/scraper/main.go
@@ -32,64 +32,61 @@ func main() {
 	}
 
 	mansWriter, err := writer.NewWriter[[]scraper.Manifesto](opts.dataDir)
-	mans, scraped := ParseLocalOrScrapeManifesti(&mansWriter, opts.force)
+	mans := ScrapeManifestiWithLocal(&mansWriter, opts.force)
 	if err != nil {
 		panic(err)
 	}
 
-	if scraped {
-		slog.Info("scraped manifesti", "found", len(mans))
+	slog.Info("finished scraping manifesti, writing to file...", "found", len(mans))
 
-		err = mansWriter.JsonWrite(constants.OutputManifestiListFilename, mans, false)
-		if err != nil {
-			panic(err)
-		}
-	} else {
-		slog.Info("parsed manifesti", "found", len(mans))
+	err = mansWriter.JsonWrite(constants.OutputManifestiListFilename, mans, false)
+	if err != nil {
+		panic(err)
 	}
 
+	slog.Info("successfully written manifesti to file!")
+
 	manEquals, err := DoLocalEqualsRemoteManifesti(&mansWriter)
 	if err != nil {
-		panic(err)
+		slog.Error("cannot perform comparison between local and remote versions", "err", err)
+		return
 	}
-	slog.Info("Scrape manifesti, equals to remote version??", "equals", manEquals)
+	slog.Info("Scrape manifesti, equals to remote version?? SUS", "equals", manEquals)
 }
 
-func ParseLocalOrScrapeManifesti(w *writer.Writer[[]scraper.Manifesto], force bool) ([]scraper.Manifesto, bool) {
+func ScrapeManifestiWithLocal(w *writer.Writer[[]scraper.Manifesto], force bool) []scraper.Manifesto {
 	fn := constants.OutputManifestiListFilename
 	fp := w.GetFilePath(fn)
 	slog := slog.With("filepath", fp)
 
 	if force {
 		slog.Info("Scraping manifesti because of -f flag")
-		return scraper.ScrapeManifesti(), true
+		return scraper.ScrapeManifesti(nil)
 	}
 
 	local, err := w.JsonRead(fn)
 	if err != nil {
 		switch {
 		case errors.Is(err, os.ErrNotExist):
 			slog.Info(fmt.Sprintf("%s file not found, running scraper...", fn))
-			return scraper.ScrapeManifesti(), true
 		case errors.As(err, new(*json.SyntaxError)):
 			slog.Error(fmt.Sprintf("%s contains malformed JSON, running scraper...", fn))
-			return scraper.ScrapeManifesti(), true
 		case errors.As(err, new(*json.UnmarshalTypeError)):
 			slog.Error(fmt.Sprintf("%s contains JSON not compatible with the Manifesto struct, running scraper...", fn))
-			return scraper.ScrapeManifesti(), true
 		default:
 			slog.Error("Failed to read from manifesti json file, running scraper...", "error", err)
-			return scraper.ScrapeManifesti(), true
 		}
+		return scraper.ScrapeManifesti(nil)
 	}
 
 	if len(local) == 0 {
 		slog.Info(fmt.Sprintf("%s file is empty, running scraper...", fn))
-		return scraper.ScrapeManifesti(), true
+		return scraper.ScrapeManifesti(nil)
 	}
 
-	return local, false
+	slog.Info(fmt.Sprintf("loaded %d manifesti from %s json file, running scraper to check if there are new ones. If you would like to regenerate the whole thing, use the -f flag.", len(local), fn))
+	return scraper.ScrapeManifesti(local)
 }
 
 func GetRemoteManifesti() ([]byte, []scraper.Manifesto, error) {
diff --git a/pkg/scraper/scraper.go b/pkg/scraper/scraper.go
index d7d6f0e..dd1df70 100644
--- a/pkg/scraper/scraper.go
+++ b/pkg/scraper/scraper.go
@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"net/http"
 	"reflect"
+	"slices"
 	"strconv"
 	"strings"
 	"sync"
@@ -37,13 +38,18 @@ type Manifesto struct {
 	DegreeType string `json:"type"`
 }
 
-func ScrapeManifesti() []Manifesto {
+func ScrapeManifesti(alreadyScraped []Manifesto) []Manifesto {
 	urls := []string{constants.WebPolimiDesignUrl, constants.WebPolimiArchUrbUrl, constants.WebPolimiIngCivUrl, constants.WebPolimiIngInfIndUrl}
 	// hrefs := []string{}
-	out := []Manifesto{}
+	out := alreadyScraped
 	wg := sync.WaitGroup{}
 
+	alreadyScrapedUrl := make([]string, len(alreadyScraped))
+	for i, as := range alreadyScraped {
+		alreadyScrapedUrl[i] = as.Url
+	}
+
 	for _, url := range urls {
 		wg.Add(1)
 		go func() {
@@ -92,7 +98,12 @@ func ScrapeManifesti() []Manifesto {
 			q.Del("__pj0")
 			optUrl.RawQuery = q.Encode()
 
-			slog.Debug("optgroup", "label", degreeType, "opt", courseName, "value", value, "link", optUrl.String())
+			if slices.Contains(alreadyScrapedUrl, optUrl.String()) {
+				slog.Debug("url already scraped, skipping...", "url", optUrl.String())
+				return
+			}
+
+			slog.Debug("found new manifesti url, scraping...", "url", optUrl.String())
 			mandoc, _, err := loadDoc(optUrl.String())
 			if err != nil {
 				log.Fatal(err)