From 90196f8918e540d83a9ac64779025ca18d84471d Mon Sep 17 00:00:00 2001 From: jakopako Date: Sun, 5 Nov 2023 15:21:40 +0100 Subject: [PATCH 1/5] trying some things --- automate/config.go | 2 +- fetch/fetcher.go | 59 +++++++++++++++++++++++++++++++++------------- scraper/scraper.go | 49 +++++++++++++++++++++----------------- 3 files changed, 71 insertions(+), 39 deletions(-) diff --git a/automate/config.go b/automate/config.go index ed96008..0a7c807 100644 --- a/automate/config.go +++ b/automate/config.go @@ -436,7 +436,7 @@ func GetDynamicFieldsConfig(s *scraper.Scraper, minOcc int, removeStaticFields b } else { fetcher = &fetch.StaticFetcher{} } - res, err := fetcher.Fetch(s.URL) + res, err := fetcher.Fetch(s.URL, nil) if err != nil { return err } diff --git a/fetch/fetcher.go b/fetch/fetcher.go index 0382275..d670104 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -15,7 +15,7 @@ import ( // A Fetcher allows to fetch the content of a web page type Fetcher interface { - Fetch(url string) (string, error) + Fetch(url string, ia *types.Interaction) (string, error) } // The StaticFetcher fetches static page content @@ -23,7 +23,7 @@ type StaticFetcher struct { UserAgent string } -func (s *StaticFetcher) Fetch(url string) (string, error) { +func (s *StaticFetcher) Fetch(url string, ia *types.Interaction) (string, error) { var resString string client := &http.Client{} @@ -52,22 +52,42 @@ func (s *StaticFetcher) Fetch(url string) (string, error) { // The DynamicFetcher renders js type DynamicFetcher struct { - UserAgent string - Interaction types.Interaction + UserAgent string + // Interaction types.Interaction WaitSeconds int + ctx context.Context } -func (d *DynamicFetcher) Fetch(url string) (string, error) { - // TODO: add user agent +func NewDynamicFetcher(ua string, s int) *DynamicFetcher { opts := append( chromedp.DefaultExecAllocatorOptions[:], chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) ) - parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) - defer cancel() - ctx, cancel := chromedp.NewContext(parentCtx) + parentCtx, _ := chromedp.NewExecAllocator(context.Background(), opts...) + ctx, _ := chromedp.NewContext(parentCtx) + return &DynamicFetcher{ + UserAgent: ua, + WaitSeconds: s, + ctx: ctx, + } +} + +func (d *DynamicFetcher) Fetch(url string, ia *types.Interaction) (string, error) { + // TODO: add user agent + start := time.Now() + // opts := append( + // chromedp.DefaultExecAllocatorOptions[:], + // chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) + // ) + // parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) + elapsed := time.Since(start) + fmt.Printf("time elapsed: %s\n", elapsed) + // defer cancel() + // ctx, cancel := chromedp.NewContext(parentCtx) + elapsed = time.Since(start) + fmt.Printf("time elapsed: %s\n", elapsed) // ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf)) - defer cancel() + // defer cancel() var body string sleepTime := 5 * time.Second @@ -79,13 +99,14 @@ func (d *DynamicFetcher) Fetch(url string) (string, error) { chromedp.Sleep(sleepTime), // for now } delay := 1000 * time.Millisecond // default is 1 second - if d.Interaction.Delay > 0 { - delay = time.Duration(d.Interaction.Delay) * time.Millisecond + if ia.Delay > 0 { + delay = time.Duration(ia.Delay) * time.Millisecond } - if d.Interaction.Type == types.InteractionTypeClick { + if ia.Type == types.InteractionTypeClick { count := 1 // default is 1 - if d.Interaction.Count > 0 { - count = d.Interaction.Count + fmt.Println("shouldnt get here") + if ia.Count > 0 { + count = ia.Count } for i := 0; i < count; i++ { // we only click the button if it exists. Do we really need this check here? @@ -93,7 +114,7 @@ func (d *DynamicFetcher) Fetch(url string) (string, error) { // actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery)) actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error { var nodes []*cdp.Node - if err := chromedp.Nodes(d.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil { + if err := chromedp.Nodes(ia.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil { return err } if len(nodes) == 0 { @@ -113,9 +134,13 @@ func (d *DynamicFetcher) Fetch(url string) (string, error) { return err })) + elapsed = time.Since(start) + fmt.Printf("time elapsed: %s\n", elapsed) // run task list - err := chromedp.Run(ctx, + err := chromedp.Run(d.ctx, actions..., ) + elapsed = time.Since(start) + fmt.Printf("time elapsed: %s\n", elapsed) return body, err } diff --git a/scraper/scraper.go b/scraper/scraper.go index 4f0eb8b..6de536c 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -289,7 +289,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string subpageURL := fmt.Sprint(currentItem[f.OnSubpage]) _, found := subDocs[subpageURL] if !found { - subRes, err := subpageFetcher.Fetch(subpageURL) + subRes, err := subpageFetcher.Fetch(subpageURL, nil) if err != nil { log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem) return @@ -415,21 +415,23 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string) (bool, string, *goquery.Document, error) { var fetcher fetch.Fetcher if c.RenderJs { - fetcher = &fetch.DynamicFetcher{ - UserAgent: userAgent, - Interaction: c.Interaction, - } + // fetcher = &fetch.DynamicFetcher{ + // UserAgent: userAgent, + // Interaction: c.Interaction, + // } + fetcher = fetch.NewDynamicFetcher(userAgent, 0) } else { fetcher = &fetch.StaticFetcher{ UserAgent: userAgent, } } if nextPageI == 0 { - res, err := fetcher.Fetch(currentPageUrl) - if err != nil { - return false, "", nil, err - } - newDoc, err := goquery.NewDocumentFromReader(strings.NewReader(res)) + newDoc, err := fetchToDoc(currentPageUrl, nil, fetcher) + // res, err := fetcher.Fetch(currentPageUrl) + // if err != nil { + // return false, "", nil, err + // } + // newDoc, err := goquery.NewDocumentFromReader(strings.NewReader(res)) if err != nil { return false, "", nil, err } @@ -440,15 +442,20 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl // check if node c.Paginator.Location.Selector is present in doc pagSelector := doc.Find(c.Paginator.Location.Selector) if len(pagSelector.Nodes) > 0 { - fetcher = &fetch.DynamicFetcher{ - UserAgent: userAgent, - Interaction: types.Interaction{ - Selector: c.Paginator.Location.Selector, - Type: types.InteractionTypeClick, - Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page - }, + // fetcher = &fetch.DynamicFetcher{ + // UserAgent: userAgent, + // Interaction: types.Interaction{ + // Selector: c.Paginator.Location.Selector, + // Type: types.InteractionTypeClick, + // Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page + // }, + // } + ia := &types.Interaction{ + Selector: c.Paginator.Location.Selector, + Type: types.InteractionTypeClick, + Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page } - nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher) + nextPageDoc, err := fetchToDoc(currentPageUrl, ia, fetcher) if err != nil { return false, "", nil, err } @@ -460,7 +467,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl baseUrl := getBaseURL(currentPageUrl, doc) nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl) if nextPageUrl != "" { - nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher) + nextPageDoc, err := fetchToDoc(nextPageUrl, nil, fetcher) if err != nil { return false, "", nil, err } @@ -474,8 +481,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl } } -func fetchToDoc(url string, fetcher fetch.Fetcher) (*goquery.Document, error) { - res, err := fetcher.Fetch(url) +func fetchToDoc(url string, ia *types.Interaction, fetcher fetch.Fetcher) (*goquery.Document, error) { + res, err := fetcher.Fetch(url, ia) if err != nil { return nil, err } From a0a910613fe4bd0b88154859bb029f80df6bca7f Mon Sep 17 00:00:00 2001 From: jakopako Date: Sun, 5 Nov 2023 15:31:39 +0100 Subject: [PATCH 2/5] wip --- automate/config.go | 2 +- fetch/fetcher.go | 22 +++++++++++++--------- scraper/scraper.go | 14 +++++++------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/automate/config.go b/automate/config.go index 0a7c807..8b6ff0e 100644 --- a/automate/config.go +++ b/automate/config.go @@ -436,7 +436,7 @@ func GetDynamicFieldsConfig(s *scraper.Scraper, minOcc int, removeStaticFields b } else { fetcher = &fetch.StaticFetcher{} } - res, err := fetcher.Fetch(s.URL, nil) + res, err := fetcher.Fetch(s.URL, fetch.FetchOpts{}) if err != nil { return err } diff --git a/fetch/fetcher.go b/fetch/fetcher.go index d670104..f879b9e 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -13,9 +13,13 @@ import ( "github.com/jakopako/goskyr/types" ) +type FetchOpts struct { + Interaction types.Interaction +} + // A Fetcher allows to fetch the content of a web page type Fetcher interface { - Fetch(url string, ia *types.Interaction) (string, error) + Fetch(url string, opts FetchOpts) (string, error) } // The StaticFetcher fetches static page content @@ -23,7 +27,7 @@ type StaticFetcher struct { UserAgent string } -func (s *StaticFetcher) Fetch(url string, ia *types.Interaction) (string, error) { +func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) { var resString string client := &http.Client{} @@ -72,7 +76,7 @@ func NewDynamicFetcher(ua string, s int) *DynamicFetcher { } } -func (d *DynamicFetcher) Fetch(url string, ia *types.Interaction) (string, error) { +func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { // TODO: add user agent start := time.Now() // opts := append( @@ -99,14 +103,14 @@ func (d *DynamicFetcher) Fetch(url string, ia *types.Interaction) (string, error chromedp.Sleep(sleepTime), // for now } delay := 1000 * time.Millisecond // default is 1 second - if ia.Delay > 0 { - delay = time.Duration(ia.Delay) * time.Millisecond + if opts.Interaction.Delay > 0 { + delay = time.Duration(opts.Interaction.Delay) * time.Millisecond } - if ia.Type == types.InteractionTypeClick { + if opts.Interaction.Type == types.InteractionTypeClick { count := 1 // default is 1 fmt.Println("shouldnt get here") - if ia.Count > 0 { - count = ia.Count + if opts.Interaction.Count > 0 { + count = opts.Interaction.Count } for i := 0; i < count; i++ { // we only click the button if it exists. Do we really need this check here? @@ -114,7 +118,7 @@ func (d *DynamicFetcher) Fetch(url string, ia *types.Interaction) (string, error // actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery)) actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error { var nodes []*cdp.Node - if err := chromedp.Nodes(ia.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil { + if err := chromedp.Nodes(opts.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil { return err } if len(nodes) == 0 { diff --git a/scraper/scraper.go b/scraper/scraper.go index 6de536c..76d7ec3 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -289,7 +289,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string subpageURL := fmt.Sprint(currentItem[f.OnSubpage]) _, found := subDocs[subpageURL] if !found { - subRes, err := subpageFetcher.Fetch(subpageURL, nil) + subRes, err := subpageFetcher.Fetch(subpageURL, fetch.FetchOpts{}) if err != nil { log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem) return @@ -426,7 +426,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl } } if nextPageI == 0 { - newDoc, err := fetchToDoc(currentPageUrl, nil, fetcher) + newDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{}) // res, err := fetcher.Fetch(currentPageUrl) // if err != nil { // return false, "", nil, err @@ -450,12 +450,12 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl // Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page // }, // } - ia := &types.Interaction{ + ia := types.Interaction{ Selector: c.Paginator.Location.Selector, Type: types.InteractionTypeClick, Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page } - nextPageDoc, err := fetchToDoc(currentPageUrl, ia, fetcher) + nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{Interaction: ia}) if err != nil { return false, "", nil, err } @@ -467,7 +467,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl baseUrl := getBaseURL(currentPageUrl, doc) nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl) if nextPageUrl != "" { - nextPageDoc, err := fetchToDoc(nextPageUrl, nil, fetcher) + nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher, fetch.FetchOpts{}) if err != nil { return false, "", nil, err } @@ -481,8 +481,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl } } -func fetchToDoc(url string, ia *types.Interaction, fetcher fetch.Fetcher) (*goquery.Document, error) { - res, err := fetcher.Fetch(url, ia) +func fetchToDoc(url string, fetcher fetch.Fetcher, opts fetch.FetchOpts) (*goquery.Document, error) { + res, err := fetcher.Fetch(url, opts) if err != nil { return nil, err } From 88cc2f9a1c089bbc88966debefbcf86a5297deb8 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sun, 5 Nov 2023 16:46:26 +0100 Subject: [PATCH 3/5] wip --- fetch/fetcher.go | 15 ++++++++------- scraper/scraper.go | 3 ++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fetch/fetcher.go b/fetch/fetcher.go index f879b9e..0b193d8 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -69,6 +69,7 @@ func NewDynamicFetcher(ua string, s int) *DynamicFetcher { ) parentCtx, _ := chromedp.NewExecAllocator(context.Background(), opts...) ctx, _ := chromedp.NewContext(parentCtx) + // TODO don't forget to actually do something with the context.CancelFunc return &DynamicFetcher{ UserAgent: ua, WaitSeconds: s, @@ -84,17 +85,17 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { // chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) // ) // parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) - elapsed := time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) + // elapsed := time.Since(start) + // fmt.Printf("time elapsed: %s\n", elapsed) // defer cancel() // ctx, cancel := chromedp.NewContext(parentCtx) - elapsed = time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) + // elapsed = time.Since(start) + // fmt.Printf("time elapsed: %s\n", elapsed) // ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf)) // defer cancel() var body string - sleepTime := 5 * time.Second + sleepTime := 2 * time.Second if d.WaitSeconds > 0 { sleepTime = time.Duration(d.WaitSeconds) * time.Second } @@ -102,7 +103,7 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { chromedp.Navigate(url), chromedp.Sleep(sleepTime), // for now } - delay := 1000 * time.Millisecond // default is 1 second + delay := 500 * time.Millisecond // default is .5 seconds if opts.Interaction.Delay > 0 { delay = time.Duration(opts.Interaction.Delay) * time.Millisecond } @@ -138,7 +139,7 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { return err })) - elapsed = time.Since(start) + elapsed := time.Since(start) fmt.Printf("time elapsed: %s\n", elapsed) // run task list err := chromedp.Run(d.ctx, diff --git a/scraper/scraper.go b/scraper/scraper.go index 76d7ec3..ecd69ce 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -208,6 +208,7 @@ type Scraper struct { Filters []*Filter `yaml:"filters,omitempty"` Paginator Paginator `yaml:"paginator,omitempty"` RenderJs bool `yaml:"renderJs,omitempty"` + PageLoadWaitSeconds int `yaml:"page_load_wait_sec,omitempty"` // only taken into account when renderJs = true Interaction types.Interaction `yaml:"interaction,omitempty"` } @@ -419,7 +420,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl // UserAgent: userAgent, // Interaction: c.Interaction, // } - fetcher = fetch.NewDynamicFetcher(userAgent, 0) + fetcher = fetch.NewDynamicFetcher(userAgent, c.PageLoadWaitSeconds) } else { fetcher = &fetch.StaticFetcher{ UserAgent: userAgent, From 5ce533ea7ae54a6e4d6a5ea78722eed727a0eca2 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sun, 5 Nov 2023 17:17:14 +0100 Subject: [PATCH 4/5] finished some minor speed improvements --- README.md | 4 +++- fetch/fetcher.go | 58 ++++++++++++++++++---------------------------- scraper/scraper.go | 44 ++++++++++++++--------------------- 3 files changed, 43 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index a73c61c..b5a2234 100644 --- a/README.md +++ b/README.md @@ -420,7 +420,9 @@ A dynamic field has a field type that can either be `text`, `url` or `date`. The ### JS Rendering -Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. For now goskyr just tells chrome to fetch the page, render it, wait 5 seconds and return the rendered dom which will then be used to extract the desired data. User interactions with the page (eg scrolling) might be implemented in the future. +Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. With `page_load_wait_sec: ` the default waiting time of 2 seconds can be adapted accordingly. + +User interactions with the page (eg scrolling) might be implemented in the future. Clicking has been implemented. TODO: document. ### Filters diff --git a/fetch/fetcher.go b/fetch/fetcher.go index 0b193d8..40e771b 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -56,10 +56,11 @@ func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) { // The DynamicFetcher renders js type DynamicFetcher struct { - UserAgent string - // Interaction types.Interaction - WaitSeconds int - ctx context.Context + UserAgent string + WaitSeconds int + ctx context.Context + cancelParent context.CancelFunc + cancel context.CancelFunc } func NewDynamicFetcher(ua string, s int) *DynamicFetcher { @@ -67,38 +68,30 @@ func NewDynamicFetcher(ua string, s int) *DynamicFetcher { chromedp.DefaultExecAllocatorOptions[:], chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) ) - parentCtx, _ := chromedp.NewExecAllocator(context.Background(), opts...) - ctx, _ := chromedp.NewContext(parentCtx) - // TODO don't forget to actually do something with the context.CancelFunc - return &DynamicFetcher{ - UserAgent: ua, - WaitSeconds: s, - ctx: ctx, + parentCtx, cancelParent := chromedp.NewExecAllocator(context.Background(), opts...) + ctx, cancel := chromedp.NewContext(parentCtx) + d := &DynamicFetcher{ + UserAgent: ua, + WaitSeconds: s, + ctx: ctx, + cancelParent: cancelParent, + cancel: cancel, + } + if d.WaitSeconds == 0 { + d.WaitSeconds = 2 // default } + return d +} + +func (d *DynamicFetcher) Cancel() { + d.cancelParent() + d.cancel() } func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { // TODO: add user agent - start := time.Now() - // opts := append( - // chromedp.DefaultExecAllocatorOptions[:], - // chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) - // ) - // parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) - // elapsed := time.Since(start) - // fmt.Printf("time elapsed: %s\n", elapsed) - // defer cancel() - // ctx, cancel := chromedp.NewContext(parentCtx) - // elapsed = time.Since(start) - // fmt.Printf("time elapsed: %s\n", elapsed) - // ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf)) - // defer cancel() - var body string - sleepTime := 2 * time.Second - if d.WaitSeconds > 0 { - sleepTime = time.Duration(d.WaitSeconds) * time.Second - } + sleepTime := time.Duration(d.WaitSeconds) * time.Second actions := []chromedp.Action{ chromedp.Navigate(url), chromedp.Sleep(sleepTime), // for now @@ -109,7 +102,6 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { } if opts.Interaction.Type == types.InteractionTypeClick { count := 1 // default is 1 - fmt.Println("shouldnt get here") if opts.Interaction.Count > 0 { count = opts.Interaction.Count } @@ -139,13 +131,9 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { return err })) - elapsed := time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) // run task list err := chromedp.Run(d.ctx, actions..., ) - elapsed = time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) return body, err } diff --git a/scraper/scraper.go b/scraper/scraper.go index ecd69ce..688f167 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -210,6 +210,7 @@ type Scraper struct { RenderJs bool `yaml:"renderJs,omitempty"` PageLoadWaitSeconds int `yaml:"page_load_wait_sec,omitempty"` // only taken into account when renderJs = true Interaction types.Interaction `yaml:"interaction,omitempty"` + fetcher fetch.Fetcher } // GetItems fetches and returns all items from a website according to the @@ -220,6 +221,17 @@ type Scraper struct { // present on the main page (not subpages). This is used by the ML feature generation. func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string]interface{}, error) { + // initialize fetcher + if c.RenderJs { + dynFetcher := fetch.NewDynamicFetcher(globalConfig.UserAgent, c.PageLoadWaitSeconds) + defer dynFetcher.Cancel() + c.fetcher = dynFetcher + } else { + c.fetcher = &fetch.StaticFetcher{ + UserAgent: globalConfig.UserAgent, + } + } + var items []map[string]interface{} if err := c.initializeFilters(); err != nil { @@ -272,17 +284,6 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string // handle all fields on subpages if !rawDyn { - var subpageFetcher fetch.Fetcher - if c.RenderJs { - subpageFetcher = &fetch.DynamicFetcher{ - UserAgent: globalConfig.UserAgent, - WaitSeconds: 1, // let's see if this works... - } - } else { - subpageFetcher = &fetch.StaticFetcher{ - UserAgent: globalConfig.UserAgent, - } - } subDocs := make(map[string]*goquery.Document) for _, f := range c.Fields { if f.OnSubpage != "" && f.Value == "" { @@ -290,7 +291,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string subpageURL := fmt.Sprint(currentItem[f.OnSubpage]) _, found := subDocs[subpageURL] if !found { - subRes, err := subpageFetcher.Fetch(subpageURL, fetch.FetchOpts{}) + subRes, err := c.fetcher.Fetch(subpageURL, fetch.FetchOpts{}) if err != nil { log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem) return @@ -414,20 +415,9 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int } func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string) (bool, string, *goquery.Document, error) { - var fetcher fetch.Fetcher - if c.RenderJs { - // fetcher = &fetch.DynamicFetcher{ - // UserAgent: userAgent, - // Interaction: c.Interaction, - // } - fetcher = fetch.NewDynamicFetcher(userAgent, c.PageLoadWaitSeconds) - } else { - fetcher = &fetch.StaticFetcher{ - UserAgent: userAgent, - } - } + if nextPageI == 0 { - newDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{}) + newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{}) // res, err := fetcher.Fetch(currentPageUrl) // if err != nil { // return false, "", nil, err @@ -456,7 +446,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl Type: types.InteractionTypeClick, Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page } - nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{Interaction: ia}) + nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia}) if err != nil { return false, "", nil, err } @@ -468,7 +458,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl baseUrl := getBaseURL(currentPageUrl, doc) nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl) if nextPageUrl != "" { - nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher, fetch.FetchOpts{}) + nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{}) if err != nil { return false, "", nil, err } From 102bd74a30c651449bef408d473cb1981f98bc13 Mon Sep 17 00:00:00 2001 From: jakopako Date: Sun, 5 Nov 2023 17:19:20 +0100 Subject: [PATCH 5/5] removed commented code --- scraper/scraper.go | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/scraper/scraper.go b/scraper/scraper.go index 688f167..327b199 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -418,11 +418,6 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl if nextPageI == 0 { newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{}) - // res, err := fetcher.Fetch(currentPageUrl) - // if err != nil { - // return false, "", nil, err - // } - // newDoc, err := goquery.NewDocumentFromReader(strings.NewReader(res)) if err != nil { return false, "", nil, err } @@ -433,14 +428,6 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl // check if node c.Paginator.Location.Selector is present in doc pagSelector := doc.Find(c.Paginator.Location.Selector) if len(pagSelector.Nodes) > 0 { - // fetcher = &fetch.DynamicFetcher{ - // UserAgent: userAgent, - // Interaction: types.Interaction{ - // Selector: c.Paginator.Location.Selector, - // Type: types.InteractionTypeClick, - // Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page - // }, - // } ia := types.Interaction{ Selector: c.Paginator.Location.Selector, Type: types.InteractionTypeClick,