diff --git a/README.md b/README.md index a73c61c..b5a2234 100644 --- a/README.md +++ b/README.md @@ -420,7 +420,9 @@ A dynamic field has a field type that can either be `text`, `url` or `date`. The ### JS Rendering -Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. For now goskyr just tells chrome to fetch the page, render it, wait 5 seconds and return the rendered dom which will then be used to extract the desired data. User interactions with the page (eg scrolling) might be implemented in the future. +Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. With `page_load_wait_sec: <seconds>` the default page-load waiting time of 2 seconds can be adjusted as needed. + +User interactions with the page (e.g. scrolling) might be implemented in the future. Clicking has been implemented. TODO: document. 
### Filters diff --git a/fetch/fetcher.go b/fetch/fetcher.go index 0b193d8..40e771b 100644 --- a/fetch/fetcher.go +++ b/fetch/fetcher.go @@ -56,10 +56,11 @@ func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) { // The DynamicFetcher renders js type DynamicFetcher struct { - UserAgent string - // Interaction types.Interaction - WaitSeconds int - ctx context.Context + UserAgent string + WaitSeconds int + ctx context.Context + cancelParent context.CancelFunc + cancel context.CancelFunc } func NewDynamicFetcher(ua string, s int) *DynamicFetcher { @@ -67,38 +68,30 @@ func NewDynamicFetcher(ua string, s int) *DynamicFetcher { chromedp.DefaultExecAllocatorOptions[:], chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) ) - parentCtx, _ := chromedp.NewExecAllocator(context.Background(), opts...) - ctx, _ := chromedp.NewContext(parentCtx) - // TODO don't forget to actually do something with the context.CancelFunc - return &DynamicFetcher{ - UserAgent: ua, - WaitSeconds: s, - ctx: ctx, + parentCtx, cancelParent := chromedp.NewExecAllocator(context.Background(), opts...) + ctx, cancel := chromedp.NewContext(parentCtx) + d := &DynamicFetcher{ + UserAgent: ua, + WaitSeconds: s, + ctx: ctx, + cancelParent: cancelParent, + cancel: cancel, + } + if d.WaitSeconds == 0 { + d.WaitSeconds = 2 // default } + return d +} + +func (d *DynamicFetcher) Cancel() { + d.cancelParent() + d.cancel() } func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { // TODO: add user agent - start := time.Now() - // opts := append( - // chromedp.DefaultExecAllocatorOptions[:], - // chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing) - // ) - // parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
- // elapsed := time.Since(start) - // fmt.Printf("time elapsed: %s\n", elapsed) - // defer cancel() - // ctx, cancel := chromedp.NewContext(parentCtx) - // elapsed = time.Since(start) - // fmt.Printf("time elapsed: %s\n", elapsed) - // ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf)) - // defer cancel() - var body string - sleepTime := 2 * time.Second - if d.WaitSeconds > 0 { - sleepTime = time.Duration(d.WaitSeconds) * time.Second - } + sleepTime := time.Duration(d.WaitSeconds) * time.Second actions := []chromedp.Action{ chromedp.Navigate(url), chromedp.Sleep(sleepTime), // for now @@ -109,7 +102,6 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { } if opts.Interaction.Type == types.InteractionTypeClick { count := 1 // default is 1 - fmt.Println("shouldnt get here") if opts.Interaction.Count > 0 { count = opts.Interaction.Count } @@ -139,13 +131,9 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) { return err })) - elapsed := time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) // run task list err := chromedp.Run(d.ctx, actions..., ) - elapsed = time.Since(start) - fmt.Printf("time elapsed: %s\n", elapsed) return body, err } diff --git a/scraper/scraper.go b/scraper/scraper.go index ecd69ce..688f167 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -210,6 +210,7 @@ type Scraper struct { RenderJs bool `yaml:"renderJs,omitempty"` PageLoadWaitSeconds int `yaml:"page_load_wait_sec,omitempty"` // only taken into account when renderJs = true Interaction types.Interaction `yaml:"interaction,omitempty"` + fetcher fetch.Fetcher } // GetItems fetches and returns all items from a website according to the @@ -220,6 +221,17 @@ type Scraper struct { // present on the main page (not subpages). This is used by the ML feature generation. 
func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string]interface{}, error) { + // initialize fetcher + if c.RenderJs { + dynFetcher := fetch.NewDynamicFetcher(globalConfig.UserAgent, c.PageLoadWaitSeconds) + defer dynFetcher.Cancel() + c.fetcher = dynFetcher + } else { + c.fetcher = &fetch.StaticFetcher{ + UserAgent: globalConfig.UserAgent, + } + } + var items []map[string]interface{} if err := c.initializeFilters(); err != nil { @@ -272,17 +284,6 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string // handle all fields on subpages if !rawDyn { - var subpageFetcher fetch.Fetcher - if c.RenderJs { - subpageFetcher = &fetch.DynamicFetcher{ - UserAgent: globalConfig.UserAgent, - WaitSeconds: 1, // let's see if this works... - } - } else { - subpageFetcher = &fetch.StaticFetcher{ - UserAgent: globalConfig.UserAgent, - } - } subDocs := make(map[string]*goquery.Document) for _, f := range c.Fields { if f.OnSubpage != "" && f.Value == "" { @@ -290,7 +291,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string subpageURL := fmt.Sprint(currentItem[f.OnSubpage]) _, found := subDocs[subpageURL] if !found { - subRes, err := subpageFetcher.Fetch(subpageURL, fetch.FetchOpts{}) + subRes, err := c.fetcher.Fetch(subpageURL, fetch.FetchOpts{}) if err != nil { log.Printf("%s ERROR: %v. 
Skipping item %v.", c.Name, err, currentItem) return @@ -414,20 +415,9 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int } func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string) (bool, string, *goquery.Document, error) { - var fetcher fetch.Fetcher - if c.RenderJs { - // fetcher = &fetch.DynamicFetcher{ - // UserAgent: userAgent, - // Interaction: c.Interaction, - // } - fetcher = fetch.NewDynamicFetcher(userAgent, c.PageLoadWaitSeconds) - } else { - fetcher = &fetch.StaticFetcher{ - UserAgent: userAgent, - } - } + if nextPageI == 0 { - newDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{}) + newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{}) // res, err := fetcher.Fetch(currentPageUrl) // if err != nil { // return false, "", nil, err @@ -456,7 +446,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl Type: types.InteractionTypeClick, Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page } - nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{Interaction: ia}) + nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia}) if err != nil { return false, "", nil, err } @@ -468,7 +458,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl baseUrl := getBaseURL(currentPageUrl, doc) nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl) if nextPageUrl != "" { - nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher, fetch.FetchOpts{}) + nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{}) if err != nil { return false, "", nil, err }