From 4700ebb38ffc45ca56d23af66e50e62ca3d8da5a Mon Sep 17 00:00:00 2001 From: ErikOwen Date: Mon, 16 Oct 2023 13:06:11 -0700 Subject: [PATCH 1/3] update non-headless crawling to not follow redirects when the disable-redirects flag is enabled --- pkg/engine/common/base.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index d9c9a0ed..6cf939ff 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -239,6 +239,10 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { if resp.Resp == nil || resp.Reader == nil { return } + isRedirectResponse := resp.StatusCode >= 300 && resp.StatusCode < 400 + if s.Options.Options.DisableRedirects && isRedirectResponse { + return + } navigationRequests := parser.ParseResponse(resp) s.Enqueue(crawlSession.Queue, navigationRequests...) From 6d0c3caad405659a7f5f253ec5a27f75533c0ee7 Mon Sep 17 00:00:00 2001 From: ErikOwen Date: Wed, 18 Oct 2023 16:44:28 -0700 Subject: [PATCH 2/3] update headless crawling to not follow redirects when the disable-redirects flag is enabled --- pkg/engine/hybrid/crawl.go | 6 ++++++ pkg/engine/parser/parser.go | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/engine/hybrid/crawl.go b/pkg/engine/hybrid/crawl.go index 5684f527..962d0185 100644 --- a/pkg/engine/hybrid/crawl.go +++ b/pkg/engine/hybrid/crawl.go @@ -128,6 +128,12 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re // process the raw response navigationRequests := parser.ParseResponse(resp) c.Enqueue(s.Queue, navigationRequests...) + + // do not continue following the request if it's a redirect and redirects are disabled + isRedirectResponse := statusCode >= 300 && statusCode < 400 + if c.Options.Options.DisableRedirects && isRedirectResponse { + return nil + } return FetchContinueRequest(page, e) })() //nolint defer func() { diff --git a/pkg/engine/parser/parser.go b/pkg/engine/parser/parser.go index 7d8111b4..a1e3f064 100644 --- a/pkg/engine/parser/parser.go +++ b/pkg/engine/parser/parser.go @@ -37,7 +37,6 @@ var responseParsers = []responseParser{ // Header based parsers {headerParser, headerContentLocationParser}, {headerParser, headerLinkParser}, - {headerParser, headerLocationParser}, {headerParser, headerRefreshParser}, // Body based parsers @@ -84,6 +83,9 @@ func InitWithOptions(options *types.Options) { responseParsers = append(responseParsers, responseParser{contentParser, scriptJSFileRegexParser}) responseParsers = append(responseParsers, responseParser{contentParser, bodyScrapeEndpointsParser}) } + if !options.DisableRedirects { + responseParsers = append(responseParsers, responseParser{headerParser, headerLocationParser}) + } } // parseResponse runs the response parsers on the navigation response From 19db3cfb92bf22de896147d9a93dfb36ab3c7f16 Mon Sep 17 00:00:00 2001 From: Mzack9999 Date: Tue, 31 Oct 2023 17:23:28 +0100 Subject: [PATCH 3/3] small refactor --- pkg/engine/common/base.go | 3 +-- pkg/engine/hybrid/crawl.go | 3 +-- pkg/navigation/response.go | 4 ++++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index 6cf939ff..2c11ca35 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -239,8 +239,7 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { if resp.Resp == nil || resp.Reader == nil { return } - isRedirectResponse := resp.StatusCode >= 300 && resp.StatusCode < 400 - if s.Options.Options.DisableRedirects && isRedirectResponse { + if s.Options.Options.DisableRedirects && resp.IsRedirect() { return } diff --git a/pkg/engine/hybrid/crawl.go b/pkg/engine/hybrid/crawl.go index 962d0185..212decaf 100644 --- a/pkg/engine/hybrid/crawl.go +++ b/pkg/engine/hybrid/crawl.go @@ -130,8 +130,7 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re c.Enqueue(s.Queue, navigationRequests...) // do not continue following the request if it's a redirect and redirects are disabled - isRedirectResponse := statusCode >= 300 && statusCode < 400 - if c.Options.Options.DisableRedirects && isRedirectResponse { + if c.Options.Options.DisableRedirects && resp.IsRedirect() { return nil } return FetchContinueRequest(page, e) diff --git a/pkg/navigation/response.go b/pkg/navigation/response.go index aed2bd65..cb6f0ef2 100644 --- a/pkg/navigation/response.go +++ b/pkg/navigation/response.go @@ -58,3 +58,7 @@ func (n Response) AbsoluteURL(path string) string { final := absURL.String() return final } + +func (n Response) IsRedirect() bool { + return n.StatusCode >= 300 && n.StatusCode <= 399 +}