Merge pull request #246 from jakopako/feature/better-render-performance

Feature/better render performance

jakopako authored Nov 5, 2023
2 parents acedf0b + 102bd74 commit 4a979b7
Showing 4 changed files with 69 additions and 64 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -420,7 +420,9 @@ A dynamic field has a field type that can either be `text`, `url` or `date`. The
 
 ### JS Rendering
 
-Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. For now goskyr just tells chrome to fetch the page, render it, wait 5 seconds and return the rendered dom which will then be used to extract the desired data. User interactions with the page (eg scrolling) might be implemented in the future.
+Since version 0.3.0 JS rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. With `page_load_wait_sec: <seconds>` the default waiting time of 2 seconds can be adjusted as needed.
+
+User interactions with the page (e.g. scrolling) might be implemented in the future. Clicking has already been implemented but is not yet documented.
 
 ### Filters
 
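For reference, a scraper configuration exercising these options might look like the following sketch. Everything except `renderJs` and `page_load_wait_sec` (whose yaml tags appear in the scraper/scraper.go diff below) is illustrative, and the `interaction` key names are assumptions since that feature is not yet documented:

```yaml
scrapers:
  - name: example-events                 # illustrative
    url: https://www.example.com/events  # illustrative
    renderJs: true                       # render the page with headless Chrome
    page_load_wait_sec: 4                # override the 2-second default wait
    interaction:                         # key names assumed; feature undocumented
      type: click
      selector: button.load-more
      count: 2
```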
2 changes: 1 addition & 1 deletion automate/config.go
@@ -436,7 +436,7 @@ func GetDynamicFieldsConfig(s *scraper.Scraper, minOcc int, removeStaticFields b
 	} else {
 		fetcher = &fetch.StaticFetcher{}
 	}
-	res, err := fetcher.Fetch(s.URL)
+	res, err := fetcher.Fetch(s.URL, fetch.FetchOpts{})
 	if err != nil {
 		return err
 	}
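Call sites like the one above now pass an explicit options value even when no interaction is needed; the empty `fetch.FetchOpts{}` is the no-op default. A minimal sketch of using the new two-argument `Fetch` (the URL and user agent are illustrative):

```go
package main

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/fetch"
)

func main() {
	// A static fetch involves no browser, so empty options suffice.
	f := &fetch.StaticFetcher{UserAgent: "goskyr-example"}
	html, err := f.Fetch("https://example.com", fetch.FetchOpts{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("fetched %d bytes of HTML\n", len(html))
}
```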
64 changes: 41 additions & 23 deletions fetch/fetcher.go
@@ -13,17 +13,21 @@ import (
 	"github.com/jakopako/goskyr/types"
 )
 
+type FetchOpts struct {
+	Interaction types.Interaction
+}
+
 // A Fetcher allows to fetch the content of a web page
 type Fetcher interface {
-	Fetch(url string) (string, error)
+	Fetch(url string, opts FetchOpts) (string, error)
 }
 
 // The StaticFetcher fetches static page content
 type StaticFetcher struct {
 	UserAgent string
 }
 
-func (s *StaticFetcher) Fetch(url string) (string, error) {
+func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) {
 	var resString string
 	client := &http.Client{}
 
@@ -52,48 +56,62 @@ func (s *StaticFetcher) Fetch(url string) (string, error) {
 
 // The DynamicFetcher renders js
 type DynamicFetcher struct {
-	UserAgent   string
-	Interaction types.Interaction
-	WaitSeconds int
+	UserAgent    string
+	WaitSeconds  int
+	ctx          context.Context
+	cancelParent context.CancelFunc
+	cancel       context.CancelFunc
 }
 
-func (d *DynamicFetcher) Fetch(url string) (string, error) {
-	// TODO: add user agent
+func NewDynamicFetcher(ua string, s int) *DynamicFetcher {
 	opts := append(
 		chromedp.DefaultExecAllocatorOptions[:],
 		chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing)
 	)
-	parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
-	defer cancel()
+	parentCtx, cancelParent := chromedp.NewExecAllocator(context.Background(), opts...)
 	ctx, cancel := chromedp.NewContext(parentCtx)
 	// ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf))
-	defer cancel()
+	d := &DynamicFetcher{
+		UserAgent:    ua,
+		WaitSeconds:  s,
+		ctx:          ctx,
+		cancelParent: cancelParent,
+		cancel:       cancel,
+	}
+	if d.WaitSeconds == 0 {
+		d.WaitSeconds = 2 // default
+	}
+	return d
+}
+
+func (d *DynamicFetcher) Cancel() {
+	d.cancelParent()
+	d.cancel()
+}
+
+func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
+	// TODO: add user agent
 	var body string
-	sleepTime := 5 * time.Second
-	if d.WaitSeconds > 0 {
-		sleepTime = time.Duration(d.WaitSeconds) * time.Second
-	}
+	sleepTime := time.Duration(d.WaitSeconds) * time.Second
 	actions := []chromedp.Action{
 		chromedp.Navigate(url),
 		chromedp.Sleep(sleepTime), // for now
 	}
-	delay := 1000 * time.Millisecond // default is 1 second
-	if d.Interaction.Delay > 0 {
-		delay = time.Duration(d.Interaction.Delay) * time.Millisecond
+	delay := 500 * time.Millisecond // default is .5 seconds
+	if opts.Interaction.Delay > 0 {
+		delay = time.Duration(opts.Interaction.Delay) * time.Millisecond
 	}
-	if d.Interaction.Type == types.InteractionTypeClick {
+	if opts.Interaction.Type == types.InteractionTypeClick {
 		count := 1 // default is 1
-		if d.Interaction.Count > 0 {
-			count = d.Interaction.Count
+		if opts.Interaction.Count > 0 {
+			count = opts.Interaction.Count
 		}
 		for i := 0; i < count; i++ {
 			// we only click the button if it exists. Do we really need this check here?
 			// TODO: should we click as many times as possible if count == 0? How would we implement this?
 			// actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery))
 			actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
 				var nodes []*cdp.Node
-				if err := chromedp.Nodes(d.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
+				if err := chromedp.Nodes(opts.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
 					return err
 				}
 				if len(nodes) == 0 {
@@ -114,7 +132,7 @@ func (d *DynamicFetcher) Fetch(url string) (string, error) {
 			}))
 
 	// run task list
-	err := chromedp.Run(ctx,
+	err := chromedp.Run(d.ctx,
 		actions...,
 	)
 	return body, err
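Taken together, these changes move the expensive chromedp allocator and context setup out of `Fetch` and into `NewDynamicFetcher`, so a single browser context is reused across fetches; that reuse is the render-performance gain this PR targets. A sketch of the intended lifecycle (URLs, user agent, and the consent-button selector are illustrative):

```go
package main

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/fetch"
	"github.com/jakopako/goskyr/types"
)

func main() {
	// One long-lived browser context; wait 3 seconds after each page load.
	d := fetch.NewDynamicFetcher("goskyr-example", 3)
	defer d.Cancel() // tear down the chromedp contexts when done

	// A hypothetical click interaction, e.g. dismissing a consent banner.
	opts := fetch.FetchOpts{Interaction: types.Interaction{
		Type:     types.InteractionTypeClick,
		Selector: "#accept-cookies",
		Count:    1,
	}}

	for _, u := range []string{"https://example.com/a", "https://example.com/b"} {
		html, err := d.Fetch(u, opts) // reuses the same browser context
		if err != nil {
			log.Printf("fetching %s failed: %v", u, err)
			continue
		}
		fmt.Printf("%s: %d bytes\n", u, len(html))
	}
}
```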
63 changes: 24 additions & 39 deletions scraper/scraper.go
@@ -208,7 +208,9 @@ type Scraper struct {
 	Filters             []*Filter         `yaml:"filters,omitempty"`
 	Paginator           Paginator         `yaml:"paginator,omitempty"`
 	RenderJs            bool              `yaml:"renderJs,omitempty"`
+	PageLoadWaitSeconds int               `yaml:"page_load_wait_sec,omitempty"` // only taken into account when renderJs = true
 	Interaction         types.Interaction `yaml:"interaction,omitempty"`
+	fetcher             fetch.Fetcher
 }
 
 // GetItems fetches and returns all items from a website according to the
@@ -219,6 +221,17 @@ type Scraper struct {
 // present on the main page (not subpages). This is used by the ML feature generation.
 func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string]interface{}, error) {
 
+	// initialize fetcher
+	if c.RenderJs {
+		dynFetcher := fetch.NewDynamicFetcher(globalConfig.UserAgent, c.PageLoadWaitSeconds)
+		defer dynFetcher.Cancel()
+		c.fetcher = dynFetcher
+	} else {
+		c.fetcher = &fetch.StaticFetcher{
+			UserAgent: globalConfig.UserAgent,
+		}
+	}
+
 	var items []map[string]interface{}
 
 	if err := c.initializeFilters(); err != nil {
@@ -271,25 +284,14 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
 
 		// handle all fields on subpages
 		if !rawDyn {
-			var subpageFetcher fetch.Fetcher
-			if c.RenderJs {
-				subpageFetcher = &fetch.DynamicFetcher{
-					UserAgent:   globalConfig.UserAgent,
-					WaitSeconds: 1, // let's see if this works...
-				}
-			} else {
-				subpageFetcher = &fetch.StaticFetcher{
-					UserAgent: globalConfig.UserAgent,
-				}
-			}
 			subDocs := make(map[string]*goquery.Document)
 			for _, f := range c.Fields {
 				if f.OnSubpage != "" && f.Value == "" {
 					// check whether we fetched the page already
 					subpageURL := fmt.Sprint(currentItem[f.OnSubpage])
 					_, found := subDocs[subpageURL]
 					if !found {
-						subRes, err := subpageFetcher.Fetch(subpageURL)
+						subRes, err := c.fetcher.Fetch(subpageURL, fetch.FetchOpts{})
 						if err != nil {
 							log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem)
 							return
@@ -413,23 +415,9 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
 }
 
 func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string) (bool, string, *goquery.Document, error) {
-	var fetcher fetch.Fetcher
-	if c.RenderJs {
-		fetcher = &fetch.DynamicFetcher{
-			UserAgent:   userAgent,
-			Interaction: c.Interaction,
-		}
-	} else {
-		fetcher = &fetch.StaticFetcher{
-			UserAgent: userAgent,
-		}
-	}
-
 	if nextPageI == 0 {
-		res, err := fetcher.Fetch(currentPageUrl)
-		if err != nil {
-			return false, "", nil, err
-		}
-		newDoc, err := goquery.NewDocumentFromReader(strings.NewReader(res))
+		newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{})
 		if err != nil {
 			return false, "", nil, err
 		}
Expand All @@ -440,15 +428,12 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
// check if node c.Paginator.Location.Selector is present in doc
pagSelector := doc.Find(c.Paginator.Location.Selector)
if len(pagSelector.Nodes) > 0 {
fetcher = &fetch.DynamicFetcher{
UserAgent: userAgent,
Interaction: types.Interaction{
Selector: c.Paginator.Location.Selector,
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
},
ia := types.Interaction{
Selector: c.Paginator.Location.Selector,
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
}
nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher)
nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia})
if err != nil {
return false, "", nil, err
}
@@ -460,7 +445,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 			baseUrl := getBaseURL(currentPageUrl, doc)
 			nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl)
 			if nextPageUrl != "" {
-				nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher)
+				nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{})
 				if err != nil {
 					return false, "", nil, err
 				}
@@ -474,8 +459,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 	}
 }
 
-func fetchToDoc(url string, fetcher fetch.Fetcher) (*goquery.Document, error) {
-	res, err := fetcher.Fetch(url)
+func fetchToDoc(url string, fetcher fetch.Fetcher, opts fetch.FetchOpts) (*goquery.Document, error) {
+	res, err := fetcher.Fetch(url, opts)
 	if err != nil {
 		return nil, err
 	}
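End to end, the new flow is: `GetItems` builds the fetcher once (a `DynamicFetcher` when `renderJs` is set), every page and subpage fetch goes through `c.fetcher`, and pagination clicks travel via `FetchOpts`. A minimal driver sketch (field values are illustrative, and a real configuration would also define `fields`):

```go
package main

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/scraper"
)

func main() {
	s := scraper.Scraper{
		Name:                "example",                        // illustrative
		URL:                 "https://www.example.com/events", // illustrative
		RenderJs:            true,
		PageLoadWaitSeconds: 3, // yaml: page_load_wait_sec
	}
	gc := &scraper.GlobalConfig{UserAgent: "goskyr-example"}

	items, err := s.GetItems(gc, false)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("scraped %d items\n", len(items))
}
```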