finished some minor speed improvements
jakopako committed Nov 5, 2023
1 parent 88cc2f9 commit 5ce533e
Showing 3 changed files with 43 additions and 63 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -420,7 +420,9 @@ A dynamic field has a field type that can either be `text`, `url` or `date`. The

### JS Rendering

Since version 0.3.0 js rendering is supported. For this to work the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper just add `renderJs: true` and everything will be taken care of. For now goskyr just tells chrome to fetch the page, render it, wait 5 seconds and return the rendered dom which will then be used to extract the desired data. User interactions with the page (eg scrolling) might be implemented in the future.
Since version 0.3.0, JS rendering is supported. For this to work, the `google-chrome` binary needs to be installed. In the configuration snippet of a scraper, just add `renderJs: true` and everything will be taken care of. With `page_load_wait_sec: <seconds>` the default waiting time of 2 seconds can be adjusted as needed.

Further user interactions with the page (e.g. scrolling) might be implemented in the future. Clicking has already been implemented but still needs to be documented (TODO).
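
For example, a scraper configuration that enables JS rendering with a longer wait time might look roughly like the following sketch (only `renderJs` and `page_load_wait_sec` are taken from this section; the surrounding keys such as `scrapers`, `name` and `url` are illustrative placeholders):

```yml
scrapers:
  - name: example-site            # placeholder name
    url: https://www.example.com  # placeholder URL
    renderJs: true                # render the page with headless Chrome
    page_load_wait_sec: 5         # wait 5 seconds instead of the default 2
```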

### Filters

58 changes: 23 additions & 35 deletions fetch/fetcher.go
@@ -56,49 +56,42 @@ func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) {

// The DynamicFetcher renders js
type DynamicFetcher struct {
UserAgent string
// Interaction types.Interaction
WaitSeconds int
ctx context.Context
UserAgent string
WaitSeconds int
ctx context.Context
cancelParent context.CancelFunc
cancel context.CancelFunc
}

func NewDynamicFetcher(ua string, s int) *DynamicFetcher {
opts := append(
chromedp.DefaultExecAllocatorOptions[:],
chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing)
)
parentCtx, _ := chromedp.NewExecAllocator(context.Background(), opts...)
ctx, _ := chromedp.NewContext(parentCtx)
// TODO don't forget to actually do something with the context.CancelFunc
return &DynamicFetcher{
UserAgent: ua,
WaitSeconds: s,
ctx: ctx,
parentCtx, cancelParent := chromedp.NewExecAllocator(context.Background(), opts...)
ctx, cancel := chromedp.NewContext(parentCtx)
d := &DynamicFetcher{
UserAgent: ua,
WaitSeconds: s,
ctx: ctx,
cancelParent: cancelParent,
cancel: cancel,
}
if d.WaitSeconds == 0 {
d.WaitSeconds = 2 // default
}
return d
}

func (d *DynamicFetcher) Cancel() {
d.cancelParent()
d.cancel()
}

func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
// TODO: add user agent
start := time.Now()
// opts := append(
// chromedp.DefaultExecAllocatorOptions[:],
// chromedp.WindowSize(1920, 1080), // init with a desktop view (sometimes pages look different on mobile, eg buttons are missing)
// )
// parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
// elapsed := time.Since(start)
// fmt.Printf("time elapsed: %s\n", elapsed)
// defer cancel()
// ctx, cancel := chromedp.NewContext(parentCtx)
// elapsed = time.Since(start)
// fmt.Printf("time elapsed: %s\n", elapsed)
// ctx, cancel := chromedp.NewContext(parentCtx, chromedp.WithDebugf(log.Printf))
// defer cancel()

var body string
sleepTime := 2 * time.Second
if d.WaitSeconds > 0 {
sleepTime = time.Duration(d.WaitSeconds) * time.Second
}
sleepTime := time.Duration(d.WaitSeconds) * time.Second
actions := []chromedp.Action{
chromedp.Navigate(url),
chromedp.Sleep(sleepTime), // for now
@@ -109,7 +102,6 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
}
if opts.Interaction.Type == types.InteractionTypeClick {
count := 1 // default is 1
fmt.Println("shouldnt get here")
if opts.Interaction.Count > 0 {
count = opts.Interaction.Count
}
@@ -139,13 +131,9 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
return err
}))

elapsed := time.Since(start)
fmt.Printf("time elapsed: %s\n", elapsed)
// run task list
err := chromedp.Run(d.ctx,
actions...,
)
elapsed = time.Since(start)
fmt.Printf("time elapsed: %s\n", elapsed)
return body, err
}
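
Taken together, the new constructor and `Cancel` method mean a single `DynamicFetcher` (and hence a single headless-Chrome allocator) can be created once and reused for many fetches, which is presumably where the commit's "minor speed improvements" come from. A minimal usage sketch, assuming the import path `github.com/jakopako/goskyr/fetch`:

```go
package main

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/fetch" // assumed import path
)

func main() {
	// Create the fetcher once; NewDynamicFetcher sets up the chromedp
	// allocator/context and falls back to a 2-second wait if 0 is passed.
	f := fetch.NewDynamicFetcher("Mozilla/5.0 (goskyr)", 3)
	defer f.Cancel() // releases both chromedp contexts

	// Reuse the same fetcher for every page instead of spawning a new
	// browser per request.
	body, err := f.Fetch("https://www.example.com", fetch.FetchOpts{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("fetched %d bytes of rendered HTML\n", len(body))
}
```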
44 changes: 17 additions & 27 deletions scraper/scraper.go
@@ -210,6 +210,7 @@ type Scraper struct {
RenderJs bool `yaml:"renderJs,omitempty"`
PageLoadWaitSeconds int `yaml:"page_load_wait_sec,omitempty"` // only taken into account when renderJs = true
Interaction types.Interaction `yaml:"interaction,omitempty"`
fetcher fetch.Fetcher
}

// GetItems fetches and returns all items from a website according to the
@@ -220,6 +221,17 @@ type Scraper struct {
// present on the main page (not subpages). This is used by the ML feature generation.
func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string]interface{}, error) {

// initialize fetcher
if c.RenderJs {
dynFetcher := fetch.NewDynamicFetcher(globalConfig.UserAgent, c.PageLoadWaitSeconds)
defer dynFetcher.Cancel()
c.fetcher = dynFetcher
} else {
c.fetcher = &fetch.StaticFetcher{
UserAgent: globalConfig.UserAgent,
}
}

var items []map[string]interface{}

if err := c.initializeFilters(); err != nil {
@@ -272,25 +284,14 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string

// handle all fields on subpages
if !rawDyn {
var subpageFetcher fetch.Fetcher
if c.RenderJs {
subpageFetcher = &fetch.DynamicFetcher{
UserAgent: globalConfig.UserAgent,
WaitSeconds: 1, // let's see if this works...
}
} else {
subpageFetcher = &fetch.StaticFetcher{
UserAgent: globalConfig.UserAgent,
}
}
subDocs := make(map[string]*goquery.Document)
for _, f := range c.Fields {
if f.OnSubpage != "" && f.Value == "" {
// check whether we fetched the page already
subpageURL := fmt.Sprint(currentItem[f.OnSubpage])
_, found := subDocs[subpageURL]
if !found {
subRes, err := subpageFetcher.Fetch(subpageURL, fetch.FetchOpts{})
subRes, err := c.fetcher.Fetch(subpageURL, fetch.FetchOpts{})
if err != nil {
log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem)
return
@@ -414,20 +415,9 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
}

func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string) (bool, string, *goquery.Document, error) {
var fetcher fetch.Fetcher
if c.RenderJs {
// fetcher = &fetch.DynamicFetcher{
// UserAgent: userAgent,
// Interaction: c.Interaction,
// }
fetcher = fetch.NewDynamicFetcher(userAgent, c.PageLoadWaitSeconds)
} else {
fetcher = &fetch.StaticFetcher{
UserAgent: userAgent,
}
}

if nextPageI == 0 {
newDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{})
newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{})
// res, err := fetcher.Fetch(currentPageUrl)
// if err != nil {
// return false, "", nil, err
@@ -456,7 +446,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
}
nextPageDoc, err := fetchToDoc(currentPageUrl, fetcher, fetch.FetchOpts{Interaction: ia})
nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia})
if err != nil {
return false, "", nil, err
}
@@ -468,7 +458,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
baseUrl := getBaseURL(currentPageUrl, doc)
nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl)
if nextPageUrl != "" {
nextPageDoc, err := fetchToDoc(nextPageUrl, fetcher, fetch.FetchOpts{})
nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{})
if err != nil {
return false, "", nil, err
}
