From fd8dda07bde1574b34ae1f64ab88a0cbb88856d0 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 19:20:06 +0200 Subject: [PATCH 1/7] Add debug flag - wip Fixes #218 --- fetch/fetcher.go | 4 ++++ go.mod | 2 +- main.go | 59 +++++++++++++++++++++++++++++++++------------- output/api.go | 15 +++++++----- output/file.go | 19 ++++++++------- output/stdout.go | 7 +++--- scraper/scraper.go | 17 +++++++------ 7 files changed, 79 insertions(+), 44 deletions(-) diff --git a/fetch/fetcher.go b/fetch/fetcher.go index 7c24a68..bb26eb2 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "log/slog" "net/http" "time" @@ -28,6 +29,7 @@ type StaticFetcher struct { } func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) { + slog.Debug("fetching page", slog.String("fetcher", "static"), slog.String("url", url), slog.String("user-agent", s.UserAgent)) var resString string client := &http.Client{} @@ -89,6 +91,8 @@ func (d *DynamicFetcher) Cancel() { } func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { + logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", url)) + logger.Debug("fetching page", slog.String("user-agent", d.UserAgent)) // start := time.Now() ctx, cancel := chromedp.NewContext(d.allocContext) // ctx, cancel := chromedp.NewContext(d.allocContext, diff --git a/go.mod b/go.mod index 87b18d4..0147190 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/jakopako/goskyr -go 1.19 +go 1.22 require ( github.com/PuerkitoBio/goquery v1.9.1 diff --git a/main.go b/main.go index 4480e5e..6a31f80 100644 --- a/main.go +++ b/main.go @@ -3,7 +3,7 @@ package main import ( "flag" "fmt" - "log" + "log/slog" "math" "os" "sync" @@ -18,19 +18,21 @@ import ( var version = "dev" func worker(sc chan scraper.Scraper, ic chan map[string]interface{}, gc *scraper.GlobalConfig, threadNr int) { + workerLogger := slog.With(slog.Int("thread", threadNr)) for s := range sc { - log.Printf("thread %d: scraping %s\n", threadNr, s.Name) + scraperLogger := workerLogger.With(slog.String("name", s.Name)) + scraperLogger.Info("starting scraping task") items, err := s.GetItems(gc, false) if err != nil { - log.Printf("%s ERROR: %s", s.Name, err) + scraperLogger.Error(fmt.Sprintf("%s: %s", s.Name, err)) continue } - log.Printf("thread %d: fetched %d %s items\n", threadNr, len(items), s.Name) + scraperLogger.Info(fmt.Sprintf("fetched %d items", len(items))) for _, item := range items { ic <- item } } - log.Printf("thread %d: done working\n", threadNr) + workerLogger.Info("done working") } func main() { @@ -41,11 +43,12 @@ func main() { generateConfig := flag.String("g", "", "Automatically generate a config file for the given url.") m := flag.Int("m", 20, "The minimum number of items on a page. This is needed to filter out noise. Works in combination with the -g flag.") f := flag.Bool("f", false, "Only show fields that have varying values across the list of items. Works in combination with the -g flag.") - d := flag.Bool("d", false, "Render JS before generating a configuration file. Works in combination with the -g flag.") + renderJs := flag.Bool("r", false, "Render JS before generating a configuration file. Works in combination with the -g flag.") extractFeatures := flag.String("e", "", "Extract ML features based on the given configuration file (-c) and write them to the given file in csv format.") wordsDir := flag.String("w", "word-lists", "The directory that contains a number of files containing words of different languages. This is needed for the ML part (use with -e or -b).") buildModel := flag.String("t", "", "Train a ML model based on the given csv features file. This will generate 2 files, goskyr.model and goskyr.class") modelPath := flag.String("model", "", "Use a pre-trained ML model to infer names of extracted fields. Works in combination with the -g flag.") + debug := flag.Bool("debug", false, "Set debug mode.") flag.Parse() @@ -54,14 +57,27 @@ func main() { return } + var logLevel slog.Level + if *debug { + logLevel = slog.LevelDebug + } else { + logLevel = slog.LevelInfo + } + + logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: logLevel})) + slog.SetDefault(logger) + if *generateConfig != "" { + slog.Debug("starting to generate config") s := &scraper.Scraper{URL: *generateConfig} - if *d { + if *renderJs { s.RenderJs = true } + slog.Debug(fmt.Sprintf("analyzing url %s", s.URL)) err := automate.GetDynamicFieldsConfig(s, *m, *f, *modelPath, *wordsDir) if err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("%v", err)) + os.Exit(1) } c := scraper.Config{ Scrapers: []scraper.Scraper{ @@ -70,7 +86,8 @@ func main() { } yamlData, err := yaml.Marshal(&c) if err != nil { - log.Fatalf("Error while Marshaling. %v", err) + slog.Error(fmt.Sprintf("error while marshaling. %v", err)) + os.Exit(1) } if *toStdout { @@ -78,33 +95,38 @@ func main() { } else { f, err := os.Create(*configLoc) if err != nil { - log.Fatalf("ERROR while trying to open file: %v", err) + slog.Error(fmt.Sprintf("error opening file: %v", err)) + os.Exit(1) } defer f.Close() _, err = f.Write(yamlData) if err != nil { - log.Fatalf("ERROR while trying to write to file: %v", err) + slog.Error(fmt.Sprintf("error writing to file: %v", err)) + os.Exit(1) } - log.Printf("successfully wrote config to file %s", *configLoc) + slog.Info(fmt.Sprintf("successfully wrote config to file %s", *configLoc)) } return } if *buildModel != "" { if err := ml.TrainModel(*buildModel); err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("%v", err)) + os.Exit(1) } return } config, err := scraper.NewConfig(*configLoc) if err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("%v", err)) + os.Exit(1) } if *extractFeatures != "" { if err := ml.ExtractFeatures(config, *extractFeatures, *wordsDir); err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("%v", err)) + os.Exit(1) } return } @@ -125,7 +147,8 @@ func main() { case output.FILE_WRITER_TYPE: writer = output.NewFileWriter(&config.Writer) default: - log.Fatalf("writer of type %s not implemented", config.Writer.Type) + slog.Error(fmt.Sprintf("writer of type %s not implemented", config.Writer.Type)) + os.Exit(1) } } @@ -150,8 +173,9 @@ func main() { if *singleScraper == "" { nrWorkers = int(math.Min(20, float64(len(config.Scrapers)))) } - log.Printf("running with %d threads\n", nrWorkers) + slog.Info(fmt.Sprintf("running with %d threads\n", nrWorkers)) workerWg.Add(nrWorkers) + slog.Debug("starting workers") for i := 0; i < nrWorkers; i++ { go func(j int) { defer workerWg.Done() @@ -161,6 +185,7 @@ func main() { // start writer writerWg.Add(1) + slog.Debug("starting writer") go func() { defer writerWg.Done() writer.Write(ic) diff --git a/output/api.go b/output/api.go index 25705fd..2b0e2b7 100644 --- a/output/api.go +++ b/output/api.go @@ -5,9 +5,10 @@ import ( "encoding/json" "fmt" "io" - "log" + "log/slog" "net/http" "net/url" + "os" "time" ) @@ -25,6 +26,7 @@ func NewAPIWriter(wc *WriterConfig) *APIWriter { } func (f *APIWriter) Write(items chan map[string]interface{}) { + logger := slog.With(slog.String("writer", API_WRITER_TYPE)) client := &http.Client{ Timeout: time.Second * 10, } @@ -45,7 +47,7 @@ func (f *APIWriter) Write(items chan map[string]interface{}) { // delete all items from the given source firstDate, ok := item["date"].(time.Time) if !ok { - log.Printf("error while trying to cast the date field of item %v to time.Time", item) + logger.Error(fmt.Sprintf("error while trying to cast the date field of item %v to time.Time", item)) continue } firstDateUTCF := firstDate.UTC().Format("2006-01-02 15:04") @@ -54,15 +56,16 @@ func (f *APIWriter) Write(items chan map[string]interface{}) { req.SetBasicAuth(apiUser, apiPassword) resp, err := client.Do(req) if err != nil { - log.Printf("error while deleting items from the api: %v\n", err) + logger.Error(fmt.Sprintf("error while deleting items from the api: %v\n", err)) continue } if resp.StatusCode != 200 { body, err := io.ReadAll(resp.Body) if err != nil { - log.Fatal(err) + logger.Error(fmt.Sprintf("%v", err)) } - log.Fatalf("error while deleting items. Status Code: %d\nUrl: %s Response: %s\n", resp.StatusCode, deleteURL, body) + logger.Error(fmt.Sprintf("error while deleting items. Status Code: %d\nUrl: %s Response: %s\n", resp.StatusCode, deleteURL, body)) + os.Exit(1) } resp.Body.Close() } @@ -82,7 +85,7 @@ func (f *APIWriter) Write(items chan map[string]interface{}) { nrItemsWritten = nrItemsWritten + len(batch) } - log.Printf("wrote %d items from %d sources to the api", nrItemsWritten, len(deletedSources)) + logger.Info(fmt.Sprintf("wrote %d items from %d sources to the api", nrItemsWritten, len(deletedSources))) } func postBatch(client *http.Client, batch []map[string]interface{}, apiURL, apiUser, apiPassword string) error { diff --git a/output/file.go b/output/file.go index 62e5945..8986805 100644 --- a/output/file.go +++ b/output/file.go @@ -3,7 +3,8 @@ package output import ( "bytes" "encoding/json" - "log" + "fmt" + "log/slog" "os" ) @@ -18,9 +19,11 @@ func NewFileWriter(wc *WriterConfig) *FileWriter { } } func (fr *FileWriter) Write(items chan map[string]interface{}) { + logger := slog.With(slog.String("writer", FILE_WRITER_TYPE)) f, err := os.Create(fr.writerConfig.FilePath) if err != nil { - log.Fatalf("FileWriter ERROR while trying to open file: %v", err) + logger.Error(fmt.Sprintf("error while trying to open file: %v", err)) + os.Exit(1) } defer f.Close() allItems := []map[string]interface{}{} @@ -41,18 +44,18 @@ func (fr *FileWriter) Write(items chan map[string]interface{}) { encoder := json.NewEncoder(buffer) encoder.SetEscapeHTML(false) if err := encoder.Encode(allItems); err != nil { - log.Printf("FileWriter ERROR while encoding items: %v", err) + logger.Error(fmt.Sprintf("error while encoding items: %v", err)) return } var indentBuffer bytes.Buffer if err := json.Indent(&indentBuffer, buffer.Bytes(), "", " "); err != nil { - log.Printf("FileWriter ERROR while indenting json: %v", err) + logger.Error(fmt.Sprintf("error while indenting json: %v", err)) return } - _, err = f.Write(indentBuffer.Bytes()) - if err != nil { - log.Printf("FileWriter ERROR while writing json to file: %v", err) + if _, err = f.Write(indentBuffer.Bytes()); err != nil { + logger.Error(fmt.Sprintf("error while writing json to file: %v", err)) + } else { + logger.Info(fmt.Sprintf("wrote %d items to file %s", len(allItems), fr.writerConfig.FilePath)) } - log.Printf("wrote %d items to file %s", len(allItems), fr.writerConfig.FilePath) } diff --git a/output/stdout.go b/output/stdout.go index 14a3cfa..362ea5d 100644 --- a/output/stdout.go +++ b/output/stdout.go @@ -4,12 +4,13 @@ import ( "bytes" "encoding/json" "fmt" - "log" + "log/slog" ) type StdoutWriter struct{} func (s *StdoutWriter) Write(items chan map[string]interface{}) { + logger := slog.With(slog.String("writer", STDOUT_WRITER_TYPE)) for item := range items { // We cannot use the following line of code because it automatically replaces certain html characters // with the corresponding Unicode replacement rune. @@ -24,13 +25,13 @@ func (s *StdoutWriter) Write(items chan map[string]interface{}) { encoder := json.NewEncoder(buffer) encoder.SetEscapeHTML(false) if err := encoder.Encode(item); err != nil { - log.Printf("StdoutWriter ERROR while writing item %v: %v", item, err) + logger.Error(fmt.Sprintf("error while writing item %v: %v", item, err)) continue } var indentBuffer bytes.Buffer if err := json.Indent(&indentBuffer, buffer.Bytes(), "", " "); err != nil { - log.Printf("StdoutWriter ERROR while writing item %v: %v", item, err) + logger.Error(fmt.Sprintf("error while writing item %v: %v", item, err)) continue } fmt.Print(indentBuffer.String()) diff --git a/scraper/scraper.go b/scraper/scraper.go index c68b8d9..d6bedc4 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -5,7 +5,7 @@ import ( "errors" "fmt" "io/fs" - "log" + "log/slog" "net/url" "os" "path/filepath" @@ -60,7 +60,7 @@ func NewConfig(configPath string) (*Config, error) { if config.Writer.Type == "" { config.Writer = configTmp.Writer } else { - return fmt.Errorf("ERROR: config files must only contain max. one writer") + return fmt.Errorf("config files must only contain max. one writer") } } } @@ -257,6 +257,7 @@ type Scraper struct { // present on the main page (not subpages). This is used by the ML feature generation. func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string]interface{}, error) { + scrLogger := slog.With(slog.String("name", c.Name)) // initialize fetcher if c.RenderJs { dynFetcher := fetch.NewDynamicFetcher(globalConfig.UserAgent, c.PageLoadWait) @@ -270,6 +271,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string var items []map[string]interface{} + scrLogger.Debug("initializing filters") if err := c.initializeFilters(); err != nil { return items, err } @@ -311,7 +313,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string err = extractField(&f, currentItem, s, baseUrl) } if err != nil { - log.Printf("%s ERROR: error while parsing field %s: %v. Skipping item %v.", c.Name, f.Name, err, currentItem) + scrLogger.Error(fmt.Sprintf("error while parsing field %s: %v. Skipping item %v.", f.Name, err, currentItem)) return } } @@ -329,12 +331,12 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string if !found { subRes, err := c.fetcher.Fetch(subpageURL, fetch.FetchOpts{}) if err != nil { - log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem) + scrLogger.Error(fmt.Sprintf("%v. Skipping item %v.", err, currentItem)) return } subDoc, err := goquery.NewDocumentFromReader(strings.NewReader(subRes)) if err != nil { - log.Printf("%s ERROR: error while reading document: %v. Skipping item %v", c.Name, err, currentItem) + scrLogger.Error(fmt.Sprintf("error while reading document: %v. Skipping item %v", err, currentItem)) return } subDocs[subpageURL] = subDoc @@ -342,7 +344,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string baseURLSubpage := getBaseURL(subpageURL, subDocs[subpageURL]) err = extractField(&f, currentItem, subDocs[subpageURL].Selection, baseURLSubpage) if err != nil { - log.Printf("%s ERROR: error while parsing field %s: %v. Skipping item %v.", c.Name, f.Name, err, currentItem) + scrLogger.Error(fmt.Sprintf("error while parsing field %s: %v. Skipping item %v.", f.Name, err, currentItem)) return } } @@ -351,9 +353,6 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string // check if item should be filtered filter := c.filterItem(currentItem) - if err != nil { - log.Fatalf("%s ERROR: error while applying filter: %v.", c.Name, err) - } if filter { currentItem = c.removeHiddenFields(currentItem) items = append(items, currentItem) From d7d7be264aae6954affe44eaeb94a50d654cfa19 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 19:22:08 +0200 Subject: [PATCH 2/7] updated go-tests workflow --- .github/workflows/go-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/go-tests.yml b/.github/workflows/go-tests.yml index 3dd1cfb..c9594d1 100644 --- a/.github/workflows/go-tests.yml +++ b/.github/workflows/go-tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - go-version: ["1.18", "1.19"] + go-version: ["1.22"] steps: - uses: actions/checkout@v4 From b93ac50697f2cccfe0d53ea12c1a0dcce9f503a4 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 19:32:12 +0200 Subject: [PATCH 3/7] updated makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 97014e8..cab9f4a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PACKAGE_NAME := github.com/jakopako/goskyr -GOLANG_CROSS_VERSION ?= v1.19 +GOLANG_CROSS_VERSION ?= v1.22 .PHONY: release-dry-run release-dry-run: From b0c1012f0d0c0ca2d8c3895820903b2318138b2d Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 21:55:03 +0200 Subject: [PATCH 4/7] writing html file in debug mode --- .github/workflows/codeql-analysis.yml | 8 ++++- main.go | 1 + scraper/scraper.go | 42 +++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index d530152..008d4ef 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -33,6 +33,7 @@ jobs: fail-fast: false matrix: language: ["go"] + go-version: ["1.22"] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support @@ -40,6 +41,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -66,5 +72,5 @@ jobs: # make bootstrap # make release - - name: Perform CodeQL Analysi2 + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 diff --git a/main.go b/main.go index 6a31f80..8c38406 100644 --- a/main.go +++ b/main.go @@ -162,6 +162,7 @@ func main() { go func() { for _, s := range config.Scrapers { if *singleScraper == "" || *singleScraper == s.Name { + s.Debug = *debug sc <- s } } diff --git a/scraper/scraper.go b/scraper/scraper.go index d6bedc4..42ff0c1 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -2,6 +2,7 @@ package scraper import ( "bytes" + "crypto/rand" "errors" "fmt" "io/fs" @@ -247,6 +248,7 @@ type Scraper struct { PageLoadWait int `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true Interaction types.Interaction `yaml:"interaction,omitempty"` fetcher fetch.Fetcher + Debug bool } // GetItems fetches and returns all items from a website according to the @@ -476,7 +478,7 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i *types.Interaction) (bool, string, *goquery.Document, error) { if nextPageI == 0 { - newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: *i}) + newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: *i}) if err != nil { return false, "", nil, err } @@ -493,7 +495,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl Type: types.InteractionTypeClick, Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page } - nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia}) + nextPageDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: ia}) if err != nil { return false, "", nil, err } @@ -507,7 +509,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl return false, "", nil, err } if nextPageUrl != "" { - nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{}) + nextPageDoc, err := c.fetchToDoc(nextPageUrl, fetch.FetchOpts{}) if err != nil { return false, "", nil, err } @@ -521,13 +523,41 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl } } -func fetchToDoc(url string, fetcher fetch.Fetcher, opts fetch.FetchOpts) (*goquery.Document, error) { - res, err := fetcher.Fetch(url, opts) +func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Document, error) { + res, err := c.fetcher.Fetch(url, opts) if err != nil { return nil, err } // fmt.Println(res) - return goquery.NewDocumentFromReader(strings.NewReader(res)) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(res)) + if err != nil { + return nil, err + } + + if c.Debug { + bs := make([]byte, 8) + _, err := rand.Read(bs) + if err != nil { + return nil, fmt.Errorf("failed to generate random bytes for html file name") + } + filename := fmt.Sprintf("%s-%x.html", c.Name, bs[:8]) + slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", url)) + htmlStr, err := goquery.OuterHtml(doc.Children()) + if err != nil { + return nil, fmt.Errorf("failed to write html file: %v", err) + } + + f, err := os.Create(filename) + if err != nil { + return nil, fmt.Errorf("failed to write html file: %v", err) + } + defer f.Close() + _, err = f.WriteString(htmlStr) + if err != nil { + return nil, fmt.Errorf("failed to write html file: %v", err) + } + } + return doc, nil } func extractField(field *Field, event map[string]interface{}, s *goquery.Selection, baseURL string) error { From daf9c76c720f784f5491dc686b25769118bc3e8c Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 21:58:46 +0200 Subject: [PATCH 5/7] updated help text --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index 8c38406..ebac3c5 100644 --- a/main.go +++ b/main.go @@ -48,7 +48,7 @@ func main() { wordsDir := flag.String("w", "word-lists", "The directory that contains a number of files containing words of different languages. This is needed for the ML part (use with -e or -b).") buildModel := flag.String("t", "", "Train a ML model based on the given csv features file. This will generate 2 files, goskyr.model and goskyr.class") modelPath := flag.String("model", "", "Use a pre-trained ML model to infer names of extracted fields. Works in combination with the -g flag.") - debug := flag.Bool("debug", false, "Set debug mode.") + debug := flag.Bool("debug", false, "Prints debug logs and writes scraped html's to files.") flag.Parse() From 941617deb81a809c2b1d4358328c34d11fa3ddff Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 22:01:52 +0200 Subject: [PATCH 6/7] updated readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6783edb..f0b78fc 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ Note that the machine learning feature is rather new and might not always work w ## Manual Configuration & Usage -Despite the option to automatically generate a configuration file for goskyr there are a lot more options that can be configured manually. +Despite the option to automatically generate a configuration file for goskyr there are a lot more options that can be configured manually. Note that while writing and testing a new configuration it might make sense to use the `-debug` flag when running goskyr, to enable more detailed logging and have the scraped html's written to files. A very simple configuration would look something like this: From d27f68ee1c4e6aa741175148290fe5d9e28c0b0d Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 20 Apr 2024 22:02:45 +0200 Subject: [PATCH 7/7] added comment --- scraper/scraper.go | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper/scraper.go b/scraper/scraper.go index 42ff0c1..e9a06ac 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -534,6 +534,7 @@ func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Documen return nil, err } + // in debug mode we want to write all the html's to files if c.Debug { bs := make([]byte, 8) _, err := rand.Read(bs)