From 8377f3575c6e041886fef60499d2410718a0ae77 Mon Sep 17 00:00:00 2001 From: Lorenzo Corallo Date: Thu, 17 Oct 2024 00:29:13 +0200 Subject: [PATCH] feat: parse manifesti by degreeType (multiple files) and by courseName (one file) NOTE: see docs/main_process.pdf NOTE: byCourseName file wasn't originally considered, added during construction --- cmd/parser/main.go | 48 +++++++++++++++++++++++++++++++++++++++++ cmd/scraper/main.go | 4 ++-- pkg/constants/output.go | 5 ++++- pkg/parser/manifesti.go | 48 ++++++++++++++++++++++++++++++----------- pkg/utils/fs.go | 9 +++++++- 5 files changed, 97 insertions(+), 17 deletions(-) diff --git a/cmd/parser/main.go b/cmd/parser/main.go index db3f1d4..caa64dc 100644 --- a/cmd/parser/main.go +++ b/cmd/parser/main.go @@ -2,13 +2,61 @@ package main import ( "log/slog" + "path/filepath" + "github.com/PoliNetworkOrg/rankings-backend-go/pkg/constants" "github.com/PoliNetworkOrg/rankings-backend-go/pkg/logger" + "github.com/PoliNetworkOrg/rankings-backend-go/pkg/parser" + "github.com/PoliNetworkOrg/rankings-backend-go/pkg/scraper" + "github.com/PoliNetworkOrg/rankings-backend-go/pkg/utils" + "github.com/PoliNetworkOrg/rankings-backend-go/pkg/writer" ) func main() { slog.SetDefault(logger.GetDefaultLogger()) opts := ParseOpts() + outDir := filepath.Join(opts.dataDir, constants.OutputBaseFolder, constants.OutputParsedManifestiFolder) // abs path slog.Info("argv validation", "data_dir", opts.dataDir) + + smWriter, err := writer.NewWriter[[]scraper.Manifesto](opts.dataDir) + if err != nil { + panic(err) + } + + inputMans, err := smWriter.JsonRead(constants.OutputManifestiListFilename) + if err != nil { + panic(err) + } + + byDegTypeMans := parser.ParseManifestiByDegreeType(inputMans) + dtmWriter, err := writer.NewWriter[parser.ManifestiByDegreeType](outDir) + if err != nil { + panic(err) + } + for _, m := range byDegTypeMans { + fn := utils.MakeFilename(m.DegreeType, ".json") + err := dtmWriter.JsonWrite(fn, m, false) + if err != nil { + slog.Error("error 
while writing parsed manifesti byDegreeType (grouped)", "filename", fn) + panic(err) + } + + slog.Info("manifesti parser: successful write", "filename", fn) + } + + byCourseMans := parser.ParseManifestiByCourse(inputMans) + cmWriter, err := writer.NewWriter[parser.ManifestiByCourse](outDir) + if err != nil { + panic(err) + } + + cmFn := constants.OutputParsedManifestiAllFilename + err = cmWriter.JsonWrite(cmFn, byCourseMans, false) + if err != nil { + slog.Error("error while writing parsed manifesti byCourse (all)", "filename", cmFn) + panic(err) + } + + slog.Info("manifesti parser: successful write", "filename", cmFn) } diff --git a/cmd/scraper/main.go b/cmd/scraper/main.go index 71a196e..e8d99d0 100644 --- a/cmd/scraper/main.go +++ b/cmd/scraper/main.go @@ -109,13 +109,13 @@ func GetRemoteManifesti() ([]byte, []scraper.Manifesto, error) { return nil, nil, err } - out := parser.ManifestiJson{} + out := parser.ManifestiByDegreeType{} err = json.Unmarshal(bytes, &out.Data) if err != nil { return bytes, nil, err } - return bytes, out.GetSlice(), err + return bytes, out.GetAll(), err } func DoLocalEqualsRemoteManifesti(w *writer.Writer[[]scraper.Manifesto]) (bool, error) { diff --git a/pkg/constants/output.go b/pkg/constants/output.go index ec54831..9bcc0d2 100644 --- a/pkg/constants/output.go +++ b/pkg/constants/output.go @@ -1,11 +1,14 @@ package constants const ( - OutputBaseFolder = "output" OutputHtmlFolder = "html" OutputLinksFilename = "links.txt" OutputStatsFilname = "stats.json" OutputManifestiListFilename = "manifesti_list.json" + OutputBaseFolder = "output" + OutputParsedManifestiFolder = "manifesti" + OutputParsedManifestiAllFilename = "all.json" + TmpDirectoryName = "tmp" ) diff --git a/pkg/parser/manifesti.go b/pkg/parser/manifesti.go index 47c1679..363bf05 100644 --- a/pkg/parser/manifesti.go +++ b/pkg/parser/manifesti.go @@ -10,8 +10,32 @@ type ( courseMap = map[string]locationMap ) -type ManifestiJson struct { - Data map[string]courseMap // json 
output structure +type ManifestiByDegreeType struct { + DegreeType string `json:"degreeType"` + Data courseMap `json:"data"` +} + +type ManifestiByCourse struct { + Data courseMap `json:"data"` +} + +func ParseManifestiByDegreeType(mans []scraper.Manifesto) []ManifestiByDegreeType { + byDegType := groupByDegreeType(mans) + out := make([]ManifestiByDegreeType, 0, len(byDegType)) + for dt, all := range byDegType { + data := groupByCourse(all) + m := ManifestiByDegreeType{DegreeType: dt, Data: data} + out = append(out, m) + } + + return out +} + +func ParseManifestiByCourse(mans []scraper.Manifesto) ManifestiByCourse { + byCourse := groupByCourse(mans) + return ManifestiByCourse{ + Data: byCourse, + } } func groupByDegreeType(mans []scraper.Manifesto) degreeMap { @@ -40,18 +64,16 @@ func groupByCourse(mans []scraper.Manifesto) courseMap { return out } -func (m *ManifestiJson) GetSlice() []scraper.Manifesto { +func (m *ManifestiByDegreeType) GetAll() []scraper.Manifesto { out := make([]scraper.Manifesto, 0) - for dtk, m1 := range m.Data { - for ck, m2 := range m1 { - for lk, url := range m2 { - out = append(out, scraper.Manifesto { - Name: ck, - Location: lk, - DegreeType: dtk, - Url: url, - }) - } + for ck, m2 := range m.Data { + for lk, url := range m2 { + out = append(out, scraper.Manifesto{ + Name: ck, + Location: lk, + DegreeType: m.DegreeType, + Url: url, + }) } } diff --git a/pkg/utils/fs.go b/pkg/utils/fs.go index 6c3fa81..f68c550 100644 --- a/pkg/utils/fs.go +++ b/pkg/utils/fs.go @@ -5,8 +5,8 @@ import ( "io/fs" "log/slog" "os" - "path" "path/filepath" + "strings" "github.com/PoliNetworkOrg/rankings-backend-go/pkg/constants" ) @@ -63,3 +63,10 @@ func TmpDirectory() (string, error) { return tmpPath, nil } + +func MakeFilename(str string, ext string) string { + str = strings.TrimSpace(str) + str = strings.ToLower(str) + str = strings.ReplaceAll(str, " ", "_") + return str + ext +}