diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea154fe --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.vscode +.idea +bin +upload +result +.DS_Store +internal/.DS_Store +pkg/.DS_Store \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b009722 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +VERSION=$(shell git describe --tags --always) + +.PHONY: build_all +# build +build_all: + rm -rf bin && mkdir bin bin/linux-amd64 bin/linux-arm64 bin/darwin-amd64 bin/darwin-arm64 \ + && CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/darwin-arm64/ ./... \ + && CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/darwin-amd64/ ./... \ + && CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/linux-arm64/ ./... \ + && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/linux-amd64/ ./... + +.PHONY: build +# build
build: + rm -rf bin && mkdir bin && go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/ ./... \ No newline at end of file diff --git a/README.md b/README.md index 2e3d22c..5fc4f5e 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,15 @@ crawlergo currently supports the following features: **Build** +- compile for the current platform + +```shell +make build +``` + +- cross-compile for all platforms ```shell -cd crawlergo/cmd/crawlergo -go build crawlergo_cmd.go +make build_all ``` 1. crawlergo relies only on the chrome environment to run, go to [download](https://www.chromium.org/getting-involved/download-chromium) for the new version of chromium. @@ -45,14 +51,14 @@ go build crawlergo_cmd.go Assuming your chromium installation directory is `/tmp/chromium/`, set up 10 tabs open at the same time and crawl the `testphp.vulnweb.com`: ```shell -./crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ +bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ ``` ### Using Proxy ```shell -./crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ +bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ ``` @@ -70,7 +76,7 @@ import subprocess def main(): target = "http://testphp.vulnweb.com/" - cmd = ["./crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] + cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = rsp.communicate() # "--[Mission Complete]--" is the end-of-task separator string diff --git a/README_zh-cn.md b/README_zh-cn.md index 558153c..a503095 100644 --- a/README_zh-cn.md +++ b/README_zh-cn.md @@ -28,9 +28,15 @@ crawlergo 目前支持以下特性: **Build** +- 编译适用于当前机器的文件 + +```shell +make build +``` + +- 交叉编译所有平台的文件 ```shell -cd crawlergo/cmd/crawlergo -go build crawlergo_cmd.go +make build_all ``` 1.
crawlergo 只依赖chrome运行即可,前往[下载](https://www.chromium.org/getting-involved/download-chromium)新版本的chromium。 @@ -46,7 +52,7 @@ go build crawlergo_cmd.go 假设你的chromium安装在 `/tmp/chromium/` ,开启最大10标签页,爬取AWVS靶场: ```shell -./crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ +bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ ``` @@ -54,7 +60,7 @@ go build crawlergo_cmd.go ### 使用代理 ```shell -./crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ +bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ ``` @@ -73,7 +79,7 @@ import subprocess def main(): target = "http://testphp.vulnweb.com/" - cmd = ["./crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] + cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = rsp.communicate() # "--[Mission Complete]--" 是任务结束的分隔字符串 diff --git a/cmd/crawlergo/flag.go b/cmd/crawlergo/flag.go new file mode 100644 index 0000000..86b5209 --- /dev/null +++ b/cmd/crawlergo/flag.go @@ -0,0 +1,284 @@ +package main + +import ( + "fmt" + + "github.com/Qianlitp/crawlergo/pkg/config" + + "github.com/urfave/cli/v2" +) + +var cliFlags = []cli.Flag{ + SetChromePath(), + SetCustomHeaders(), + SetPostData(), + SetMaxCrawledCount(), + SetFilterMode(), + SetOutputMode(), + SetOutputJSON(), + SetIncognitoContext(), + SetMaxTabCount(), + SetFuzzPath(), + SetFuzzPathDict(), + SetRobotsPath(), + SetRequestProxy(), + SetEncodeURL(), + SetTabRunTTL(), + SetWaitDomContentLoadedTTL(), + SetEventTriggerMode(), + SetEventTriggerInterval(), + SetBeforeExitDelay(), + SetIgnoreUrlKeywords(), + SetFormValues(), + SetFormKeywordValue(), + SetPushToProxy(), + SetPushPoolMax(), + SetLogLevel(), + SetNoHeadless(), +} + +func SetChromePath() *cli.PathFlag { + return &cli.PathFlag{ + Name: "chromium-path", + Aliases: []string{"c"}, + Usage: "`Path` of chromium executable. Such as \"/home/test/chrome-linux/chrome\"", + Required: true, + Destination: &taskConfig.ChromiumPath, + EnvVars: []string{"CRAWLERGO_CHROMIUM_PATH"}, + } +} + +func SetCustomHeaders() *cli.StringFlag { + return &cli.StringFlag{ + Name: "custom-headers", + Usage: "add additional `Headers` to each request. The input string will be parsed by json.Unmarshal", + Value: fmt.Sprintf(`{"Spider-Name": "crawlergo", "User-Agent": "%s"}`, config.DefaultUA), + Destination: &taskConfig.ExtraHeadersString, + } +} + +func SetPostData() *cli.StringFlag { + return &cli.StringFlag{ + Name: "post-data", + Aliases: []string{"d"}, + Usage: "set `PostData` to target and use POST method.", + Destination: &postData, + } +} + +func SetMaxCrawledCount() *cli.IntFlag { + return &cli.IntFlag{ + Name: "max-crawled-count", + Aliases: []string{"m"}, + Value: config.MaxCrawlCount, + Usage: "the maximum `Number` of URLs visited by the crawler in this task.", + Destination: &taskConfig.MaxCrawlCount, + } +} + +func SetFilterMode() *cli.StringFlag { + return &cli.StringFlag{ + Name: "filter-mode", + Aliases: []string{"f"}, + Value: "smart", + Usage: "filtering `Mode` used for collected requests. Allowed mode: \"simple\", \"smart\" or \"strict\".", + Destination: &taskConfig.FilterMode, + } +} + +func SetOutputMode() *cli.StringFlag { + return &cli.StringFlag{ + Name: "output-mode", + Aliases: []string{"o"}, + Value: "console", + Usage: "console print or serialize output. Allowed mode: \"console\", \"json\" or \"none\".", + Destination: &outputMode, + } +} + +func SetOutputJSON() *cli.StringFlag { + return &cli.StringFlag{ + Name: "output-dir", + Usage: "write output to a json file. Such as result_www_test_com.json", + Destination: &outputJsonPath, + } +} + +func SetIncognitoContext() *cli.BoolFlag { + return &cli.BoolFlag{ + Name: "incognito-context", + Aliases: []string{"i"}, + Value: true, + Usage: "whether the browser is launched in incognito mode.", + Destination: &taskConfig.IncognitoContext, + } +} + +func SetMaxTabCount() *cli.IntFlag { + return &cli.IntFlag{ + Name: "max-tab-count", + Aliases: []string{"t"}, + Value: 8, + Usage: "maximum `Number` of tabs allowed.", + Destination: &taskConfig.MaxTabsCount, + } +} + +func SetFuzzPath() *cli.BoolFlag { + return &cli.BoolFlag{ + Name: "fuzz-path", + Value: false, + Usage: "whether to fuzz the target with common paths.", + Destination: &taskConfig.PathByFuzz, + } +} + +func SetFuzzPathDict() *cli.PathFlag { + return &cli.PathFlag{ + Name: "fuzz-path-dict", + Usage: "`Path` of fuzz dict. Such as \"/home/test/fuzz_path.txt\"", + Destination: &taskConfig.FuzzDictPath, + } +} + +func SetRobotsPath() *cli.BoolFlag { + return &cli.BoolFlag{ + Name: "robots-path", + Value: false, + Usage: "whether to resolve paths from /robots.txt.", + Destination: &taskConfig.PathFromRobots, + } +} + +func SetRequestProxy() *cli.StringFlag { + return &cli.StringFlag{ + Name: "request-proxy", + Usage: "all requests connect through defined proxy server.", + Destination: &taskConfig.Proxy, + } +} + +// return &cli.BoolFlag{ +// Name: "bypass", +// Value: false, +// Usage: "whether to encode url with detected charset.", +// Destination: &taskConfig.EncodeURLWithCharset, +//}, +func SetEncodeURL() *cli.BoolFlag { + return &cli.BoolFlag{ + Name: "encode-url", + Value: false, + Usage: "whether to encode url with detected charset.", + Destination: &taskConfig.EncodeURLWithCharset, + } +} + +func SetTabRunTTL() *cli.DurationFlag { + return &cli.DurationFlag{ + Name: "tab-run-timeout", + Value: config.TabRunTimeout, + Usage: "the `Timeout` of a single tab task.", + Destination: &taskConfig.TabRunTimeout, + } +} + +func SetWaitDomContentLoadedTTL() *cli.DurationFlag { + return &cli.DurationFlag{ + Name: "wait-dom-content-loaded-timeout", + Value: config.DomContentLoadedTimeout, + Usage: "the `Timeout` of waiting for a page dom ready.", + Destination: &taskConfig.DomContentLoadedTimeout, + } +} + +func SetEventTriggerMode() *cli.StringFlag { + return &cli.StringFlag{ + Name: "event-trigger-mode", + Value: config.EventTriggerAsync, + Usage: "this `Value` determines how the crawler automatically triggers events. Allowed mode: \"async\" or \"sync\".", + Destination: &taskConfig.EventTriggerMode, + } +} + +func SetEventTriggerInterval() *cli.DurationFlag { + return &cli.DurationFlag{ + Name: "event-trigger-interval", + Value: config.EventTriggerInterval, + Usage: "the `Interval` of triggering each event.", + Destination: &taskConfig.EventTriggerInterval, + } +} + +func SetBeforeExitDelay() *cli.DurationFlag { + return &cli.DurationFlag{ + Name: "before-exit-delay", + Value: config.BeforeExitDelay, + Usage: "the `Time` of waiting before crawler exit.", + Destination: &taskConfig.BeforeExitDelay, + } +} + +func SetIgnoreUrlKeywords() *cli.StringSliceFlag { + return &cli.StringSliceFlag{ + Name: "ignore-url-keywords", + Aliases: []string{"iuk"}, + Value: ignoreKeywords, + Usage: "crawlergo will not crawl these URLs matched by `Keywords`. 
e.g.: -iuk logout -iuk quit -iuk exit", + DefaultText: "Default [logout quit exit]", + } +} + +func SetFormValues() *cli.StringSliceFlag { + return &cli.StringSliceFlag{ + Name: "form-values", + Aliases: []string{"fv"}, + Value: customFormTypeValues, + Usage: "custom filling text for each form type. e.g.: -fv username=crawlergo_nice -fv password=admin123", + } +} + +// 根据关键词自行选择填充文本 +func SetFormKeywordValue() *cli.StringSliceFlag { + return &cli.StringSliceFlag{ + Name: "form-keyword-values", + Aliases: []string{"fkv"}, + Value: customFormKeywordValues, + Usage: "custom filling text, fuzzy matched by keyword. e.g.: -fkv user=crawlergo_nice -fkv pass=admin123", + } +} + +func SetPushToProxy() *cli.StringFlag { + return &cli.StringFlag{ + Name: "push-to-proxy", + Usage: "every request in 'req_list' will be pushed to the proxy `Address`. Such as \"http://127.0.0.1:8080/\"", + Destination: &pushAddress, + } +} + +func SetPushPoolMax() *cli.IntFlag { + return &cli.IntFlag{ + Name: "push-pool-max", + Usage: "maximum `Number` of concurrency when pushing results to proxy.", + Value: DefaultMaxPushProxyPoolMax, + Destination: &pushProxyPoolMax, + } +} + +func SetLogLevel() *cli.StringFlag { + return &cli.StringFlag{ + Name: "log-level", + Usage: "log print `Level`, options include debug, info, warn, error and fatal.", + Value: DefaultLogLevel, + Destination: &logLevel, + } +} + +func SetNoHeadless() *cli.BoolFlag { + return &cli.BoolFlag{ + Name: "no-headless", + Value: false, + Usage: "no headless mode", + Destination: &taskConfig.NoHeadless, + } +} diff --git a/cmd/crawlergo/crawlergo_cmd.go b/cmd/crawlergo/main.go similarity index 50% rename from cmd/crawlergo/crawlergo_cmd.go rename to cmd/crawlergo/main.go index d58efa7..2ba887b 100755 --- a/cmd/crawlergo/crawlergo_cmd.go +++ b/cmd/crawlergo/main.go @@ -1,23 +1,25 @@ package main import ( - "crawlergo/pkg" - "crawlergo/pkg/config" - "crawlergo/pkg/logger" - model2 "crawlergo/pkg/model" - "crawlergo/pkg/tools" - "crawlergo/pkg/tools/requests" "encoding/json" "errors" "fmt" - "github.com/panjf2000/ants/v2" - "github.com/sirupsen/logrus" - "github.com/urfave/cli/v2" "log" "os" "os/signal" "strings" "sync" + "syscall" + + "github.com/Qianlitp/crawlergo/pkg" + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/logger" + model2 "github.com/Qianlitp/crawlergo/pkg/model" + "github.com/Qianlitp/crawlergo/pkg/tools" + "github.com/Qianlitp/crawlergo/pkg/tools/requests" + "github.com/panjf2000/ants/v2" + "github.com/sirupsen/logrus" + "github.com/urfave/cli/v2" ) /** @@ -46,21 +48,26 @@ type ProxyTask struct { pushProxy string } -const DefaultMaxPushProxyPoolMax = 10 -const DefaultLogLevel = "Info" - -var taskConfig pkg.TaskConfig -var outputMode string -var postData string -var signalChan chan os.Signal -var ignoreKeywords *cli.StringSlice -var customFormTypeValues *cli.StringSlice -var customFormKeywordValues *cli.StringSlice -var pushAddress string -var pushProxyPoolMax int -var pushProxyWG sync.WaitGroup -var outputJsonPath string -var logLevel string +const ( + DefaultMaxPushProxyPoolMax = 10 + DefaultLogLevel = "Info" +) + +var ( + taskConfig pkg.TaskConfig + outputMode string + postData string + signalChan chan os.Signal + ignoreKeywords *cli.StringSlice + customFormTypeValues *cli.StringSlice + customFormKeywordValues *cli.StringSlice + pushAddress string + pushProxyPoolMax int + pushProxyWG sync.WaitGroup + outputJsonPath string + logLevel string + Version string +) func main() { author := cli.Author{ @@ -73,181 
+80,13 @@ func main() { customFormKeywordValues = cli.NewStringSlice() app := &cli.App{ - Name: "crawlergo", - Usage: "A powerful browser crawler for web vulnerability scanners", - UsageText: "crawlergo [global options] url1 url2 url3 ... (must be same host)", - Version: "v0.4.2", - Authors: []*cli.Author{&author}, - Flags: []cli.Flag{ - &cli.PathFlag{ - Name: "chromium-path", - Aliases: []string{"c"}, - Usage: "`Path` of chromium executable. Such as \"/home/test/chrome-linux/chrome\"", - Required: true, - Destination: &taskConfig.ChromiumPath, - EnvVars: []string{"CRAWLERGO_CHROMIUM_PATH"}, - }, - &cli.StringFlag{ - Name: "custom-headers", - Usage: "add additional `Headers` to each request. The input string will be called json.Unmarshal", - Value: fmt.Sprintf(`{"Spider-Name": "crawlergo", "User-Agent": "%s"}`, config.DefaultUA), - Destination: &taskConfig.ExtraHeadersString, - }, - &cli.StringFlag{ - Name: "post-data", - Aliases: []string{"d"}, - Usage: "set `PostData` to target and use POST method.", - Destination: &postData, - }, - &cli.IntFlag{ - Name: "max-crawled-count", - Aliases: []string{"m"}, - Value: config.MaxCrawlCount, - Usage: "the maximum `Number` of URLs visited by the crawler in this task.", - Destination: &taskConfig.MaxCrawlCount, - }, - &cli.StringFlag{ - Name: "filter-mode", - Aliases: []string{"f"}, - Value: "smart", - Usage: "filtering `Mode` used for collected requests. Allowed mode:\"simple\", \"smart\" or \"strict\".", - Destination: &taskConfig.FilterMode, - }, - &cli.StringFlag{ - Name: "output-mode", - Aliases: []string{"o"}, - Value: "console", - Usage: "console print or serialize output. Allowed mode:\"console\" ,\"json\" or \"none\".", - Destination: &outputMode, - }, - &cli.StringFlag{ - Name: "output-json", - Usage: "write output to a json file.Such as result_www_crawlergo_com.json", - Destination: &outputJsonPath, - }, - &cli.BoolFlag{ - Name: "incognito-context", - Aliases: []string{"i"}, - Value: true, - Usage: "whether the browser is launched in incognito mode.", - Destination: &taskConfig.IncognitoContext, - }, - &cli.IntFlag{ - Name: "max-tab-count", - Aliases: []string{"t"}, - Value: 8, - Usage: "maximum `Number` of tabs allowed.", - Destination: &taskConfig.MaxTabsCount, - }, - &cli.BoolFlag{ - Name: "fuzz-path", - Value: false, - Usage: "whether to fuzz the target with common paths.", - Destination: &taskConfig.PathByFuzz, - }, - &cli.PathFlag{ - Name: "fuzz-path-dict", - Usage: "`Path` of fuzz dict. 
Such as \"/home/test/fuzz_path.txt\"", - Destination: &taskConfig.FuzzDictPath, - }, - &cli.BoolFlag{ - Name: "robots-path", - Value: false, - Usage: "whether to resolve paths from /robots.txt.", - Destination: &taskConfig.PathFromRobots, - }, - &cli.StringFlag{ - Name: "request-proxy", - Usage: "all requests connect through defined proxy server.", - Destination: &taskConfig.Proxy, - }, - //&cli.BoolFlag{ - // Name: "bypass", - // Value: false, - // Usage: "whether to encode url with detected charset.", - // Destination: &taskConfig.EncodeURLWithCharset, - //}, - &cli.BoolFlag{ - Name: "encode-url", - Value: false, - Usage: "whether to encode url with detected charset.", - Destination: &taskConfig.EncodeURLWithCharset, - }, - &cli.DurationFlag{ - Name: "tab-run-timeout", - Value: config.TabRunTimeout, - Usage: "the `Timeout` of a single tab task.", - Destination: &taskConfig.TabRunTimeout, - }, - &cli.DurationFlag{ - Name: "wait-dom-content-loaded-timeout", - Value: config.DomContentLoadedTimeout, - Usage: "the `Timeout` of waiting for a page dom ready.", - Destination: &taskConfig.DomContentLoadedTimeout, - }, - &cli.StringFlag{ - Name: "event-trigger-mode", - Value: config.EventTriggerAsync, - Usage: "this `Value` determines how the crawler automatically triggers events.Allowed mode:\"async\" or \"sync\".", - Destination: &taskConfig.EventTriggerMode, - }, - &cli.DurationFlag{ - Name: "event-trigger-interval", - Value: config.EventTriggerInterval, - Usage: "the `Interval` of triggering each event.", - Destination: &taskConfig.EventTriggerInterval, - }, - &cli.DurationFlag{ - Name: "before-exit-delay", - Value: config.BeforeExitDelay, - Usage: "the `Time` of waiting before crawler exit.", - Destination: &taskConfig.BeforeExitDelay, - }, - &cli.StringSliceFlag{ - Name: "ignore-url-keywords", - Aliases: []string{"iuk"}, - Value: ignoreKeywords, - Usage: "crawlergo will not crawl these URLs matched by `Keywords`. e.g.: -iuk logout -iuk quit -iuk exit", - DefaultText: "Default [logout quit exit]", - }, - &cli.StringSliceFlag{ - Name: "form-values", - Aliases: []string{"fv"}, - Value: customFormTypeValues, - Usage: "custom filling text for each form type. e.g.: -fv username=crawlergo_nice -fv password=admin123", - }, - // 根据关键词自行选择填充文本 - &cli.StringSliceFlag{ - Name: "form-keyword-values", - Aliases: []string{"fkv"}, - Value: customFormKeywordValues, - Usage: "custom filling text, fuzzy matched by keyword. e.g.: -fkv user=crawlergo_nice -fkv pass=admin123", - }, - &cli.StringFlag{ - Name: "push-to-proxy", - Usage: "every request in 'req_list' will be pushed to the proxy `Address`. Such as \"http://127.0.0.1:8080/\"", - Destination: &pushAddress, - }, - &cli.IntFlag{ - Name: "push-pool-max", - Usage: "maximum `Number` of concurrency when pushing results to proxy.", - Value: DefaultMaxPushProxyPoolMax, - Destination: &pushProxyPoolMax, - }, - &cli.StringFlag{ - Name: "log-level", - Usage: "log print `Level`, options include debug, info, warn, error and fatal.", - Value: DefaultLogLevel, - Destination: &logLevel, - }, - &cli.BoolFlag{ - Name: "no-headless", - Value: false, - Usage: "no headless mode", - Destination: &taskConfig.NoHeadless, - }, - }, - Action: run, + Name: "crawlergo", + Usage: "A powerful browser crawler for web vulnerability scanners", + UsageText: "crawlergo [global options] url1 url2 url3 ... 
(must be same host)", + Version: Version, + Authors: []*cli.Author{&author}, + Flags: cliFlags, + Action: run, } err := app.Run(os.Args) @@ -258,7 +97,7 @@ func main() { func run(c *cli.Context) error { signalChan = make(chan os.Signal, 1) - signal.Notify(signalChan, os.Interrupt) + signal.Notify(signalChan, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGINT) if c.Args().Len() == 0 { logger.Logger.Error("url must be set") @@ -449,14 +288,12 @@ func (p *ProxyTask) doRequest() { } func handleExit(t *pkg.CrawlerTask) { - select { - case <-signalChan: - fmt.Println("exit ...") - t.Pool.Tune(1) - t.Pool.Release() - t.Browser.Close() - os.Exit(-1) - } + <-signalChan + fmt.Println("exit ...") + t.Pool.Tune(1) + t.Pool.Release() + t.Browser.Close() + os.Exit(-1) } func getJsonSerialize(result *pkg.Result) []byte { diff --git a/go.mod b/go.mod index 05c2633..4531566 100755 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ -module crawlergo +module github.com/Qianlitp/crawlergo -go 1.12 +go 1.16 replace git.apache.org/thrift.git => github.com/apache/thrift v0.13.0 diff --git a/pkg/domain_collect.go b/pkg/domain_collect.go index 1bebd34..94e67b3 100755 --- a/pkg/domain_collect.go +++ b/pkg/domain_collect.go @@ -1,9 +1,10 @@ package pkg import ( - "crawlergo/pkg/model" - mapset "github.com/deckarep/golang-set" "strings" + + "github.com/Qianlitp/crawlergo/pkg/model" + mapset "github.com/deckarep/golang-set" ) func SubDomainCollect(reqList []*model.Request, HostLimit string) []string { diff --git a/pkg/engine/after_dom_tasks.go b/pkg/engine/after_dom_tasks.go index ec94281..de5bbba 100755 --- a/pkg/engine/after_dom_tasks.go +++ b/pkg/engine/after_dom_tasks.go @@ -2,14 +2,15 @@ package engine import ( "context" - "crawlergo/pkg/config" - "crawlergo/pkg/js" - "crawlergo/pkg/logger" - "github.com/chromedp/cdproto/cdp" - "github.com/chromedp/chromedp" "os" "strings" "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/js" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" ) /** diff --git a/pkg/engine/after_loaded_tasks.go b/pkg/engine/after_loaded_tasks.go index a6d6a46..878f637 100755 --- a/pkg/engine/after_loaded_tasks.go +++ b/pkg/engine/after_loaded_tasks.go @@ -2,14 +2,15 @@ package engine import ( "context" - "crawlergo/pkg/config" - "crawlergo/pkg/js" - "crawlergo/pkg/logger" - "crawlergo/pkg/tools" "fmt" + "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/js" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/Qianlitp/crawlergo/pkg/tools" "github.com/chromedp/cdproto/cdp" "github.com/chromedp/chromedp" - "time" ) /** diff --git a/pkg/engine/browser.go b/pkg/engine/browser.go index 849efb5..5a1a9e6 100755 --- a/pkg/engine/browser.go +++ b/pkg/engine/browser.go @@ -2,11 +2,12 @@ package engine import ( "context" - "crawlergo/pkg/logger" "log" "sync" "time" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/chromedp/cdproto/browser" "github.com/chromedp/chromedp" ) @@ -68,7 +69,11 @@ func InitBrowser(chromiumPath string, incognito bool, extraHeaders map[string]in ) // https://github.com/chromedp/chromedp/issues/824#issuecomment-845664441 // 如果需要在一个浏览器上创建多个tab,则需要先创建浏览器的上下文,即运行下面的语句 - chromedp.Run(bctx) + err := chromedp.Run(bctx) + if err != nil { + // the chrome process could not be started, so exit + logger.Logger.Fatal("chromedp run error: ", err.Error()) + } bro.Cancel = &cancel bro.Ctx = &bctx bro.ExtraHeaders = extraHeaders diff --git a/pkg/engine/collect_links.go 
b/pkg/engine/collect_links.go index b371336..b721755 100755 --- a/pkg/engine/collect_links.go +++ b/pkg/engine/collect_links.go @@ -2,13 +2,14 @@ package engine import ( "context" - "crawlergo/pkg/config" - "crawlergo/pkg/logger" "fmt" - "github.com/chromedp/cdproto/cdp" - "github.com/chromedp/chromedp" "regexp" "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" ) /** diff --git a/pkg/engine/intercept_request.go b/pkg/engine/intercept_request.go index 38a4f7d..c1b56ee 100755 --- a/pkg/engine/intercept_request.go +++ b/pkg/engine/intercept_request.go @@ -3,20 +3,21 @@ package engine import ( "bufio" "context" - "crawlergo/pkg/config" - "crawlergo/pkg/logger" - model2 "crawlergo/pkg/model" - "crawlergo/pkg/tools" - "crawlergo/pkg/tools/requests" "encoding/base64" - "github.com/chromedp/cdproto/fetch" - "github.com/chromedp/cdproto/network" "io" "net/textproto" "regexp" "strconv" "strings" "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/logger" + model2 "github.com/Qianlitp/crawlergo/pkg/model" + "github.com/Qianlitp/crawlergo/pkg/tools" + "github.com/Qianlitp/crawlergo/pkg/tools/requests" + "github.com/chromedp/cdproto/fetch" + "github.com/chromedp/cdproto/network" ) /** diff --git a/pkg/engine/tab.go b/pkg/engine/tab.go index 4a2890d..ba014f1 100755 --- a/pkg/engine/tab.go +++ b/pkg/engine/tab.go @@ -2,12 +2,17 @@ package engine import ( "context" - "crawlergo/pkg/config" - "crawlergo/pkg/js" - "crawlergo/pkg/logger" - model2 "crawlergo/pkg/model" "encoding/json" "fmt" + "regexp" + "strings" + "sync" + "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/js" + "github.com/Qianlitp/crawlergo/pkg/logger" + model2 "github.com/Qianlitp/crawlergo/pkg/model" "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/fetch" @@ -16,10 +21,6 @@ import ( "github.com/chromedp/cdproto/runtime" "github.com/chromedp/chromedp" "github.com/gogf/gf/encoding/gcharset" - "regexp" - "strings" - "sync" - "time" ) type Tab struct { diff --git a/pkg/filter/simple_filter.go b/pkg/filter/simple_filter.go index 71cde30..ce91386 100755 --- a/pkg/filter/simple_filter.go +++ b/pkg/filter/simple_filter.go @@ -1,10 +1,11 @@ package filter import ( - "crawlergo/pkg/config" - "crawlergo/pkg/model" - "github.com/deckarep/golang-set" "strings" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/model" + mapset "github.com/deckarep/golang-set" ) type SimpleFilter struct { diff --git a/pkg/filter/smart_filter.go b/pkg/filter/smart_filter.go index f1ae700..15aa47e 100755 --- a/pkg/filter/smart_filter.go +++ b/pkg/filter/smart_filter.go @@ -1,16 +1,17 @@ package filter import ( - "crawlergo/pkg/config" - "crawlergo/pkg/logger" - "crawlergo/pkg/model" - "crawlergo/pkg/tools" "go/types" "regexp" "sort" "strings" "sync" + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/Qianlitp/crawlergo/pkg/model" + "github.com/Qianlitp/crawlergo/pkg/tools" + mapset "github.com/deckarep/golang-set" ) @@ -97,19 +98,18 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool { return true } + req.Filter.FragmentID = s.calcFragmentID(req.URL.Fragment) + // 标记 if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS { s.getMark(req) + s.repeatCountStatistic(req) } else if 
req.Method == config.POST || req.Method == config.PUT { s.postMark(req) } else { logger.Logger.Debug("dont support such method: " + req.Method) } - if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS { - s.repeatCountStatistic(req) - } - // 对标记后的请求进行去重 uniqueId := req.Filter.UniqueId if s.uniqueMarkedIds.Contains(uniqueId) { @@ -126,16 +126,16 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool { s.overCountMark(req) // 重新计算 QueryMapId - req.Filter.QueryMapId = s.getParamMapID(req.Filter.MarkedQueryMap) + req.Filter.QueryMapId = getParamMapID(req.Filter.MarkedQueryMap) // 重新计算 PathId - req.Filter.PathId = s.getPathID(req.Filter.MarkedPath) + req.Filter.PathId = getPathID(req.Filter.MarkedPath) } else { // 重新计算 PostDataId - req.Filter.PostDataId = s.getParamMapID(req.Filter.MarkedPostDataMap) + req.Filter.PostDataId = getParamMapID(req.Filter.MarkedPostDataMap) } // 重新计算请求唯一ID - req.Filter.UniqueId = s.getMarkedUniqueID(req) + req.Filter.UniqueId = getMarkedUniqueID(req) // 新的ID再次去重 newUniqueId := req.Filter.UniqueId @@ -173,21 +173,21 @@ func (s *SmartFilter) getMark(req *model.Request) { // 依次打标记 queryMap := todoURL.QueryMap() - queryMap = s.markParamName(queryMap) + queryMap = markParamName(queryMap) queryMap = s.markParamValue(queryMap, *req) - markedPath := s.MarkPath(todoURL.Path) + markedPath := MarkPath(todoURL.Path) // 计算唯一的ID var queryKeyID string var queryMapID string if len(queryMap) != 0 { - queryKeyID = s.getKeysID(queryMap) - queryMapID = s.getParamMapID(queryMap) + queryKeyID = getKeysID(queryMap) + queryMapID = getParamMapID(queryMap) } else { queryKeyID = "" queryMapID = "" } - pathID := s.getPathID(markedPath) + pathID := getPathID(markedPath) req.Filter.MarkedQueryMap = queryMap req.Filter.QueryKeysId = queryKeyID @@ -196,7 +196,7 @@ func (s *SmartFilter) getMark(req *model.Request) { req.Filter.PathId = pathID // 最后计算标记后的唯一请求ID - req.Filter.UniqueId = s.getMarkedUniqueID(req) + req.Filter.UniqueId = getMarkedUniqueID(req) } /** @@ -205,18 +205,18 @@ func (s *SmartFilter) getMark(req *model.Request) { func (s *SmartFilter) postMark(req *model.Request) { postDataMap := req.PostDataMap() - postDataMap = s.markParamName(postDataMap) + postDataMap = markParamName(postDataMap) postDataMap = s.markParamValue(postDataMap, *req) - markedPath := s.MarkPath(req.URL.Path) + markedPath := MarkPath(req.URL.Path) // 计算唯一的ID var postDataMapID string if len(postDataMap) != 0 { - postDataMapID = s.getParamMapID(postDataMap) + postDataMapID = getParamMapID(postDataMap) } else { postDataMapID = "" } - pathID := s.getPathID(markedPath) + pathID := getPathID(markedPath) req.Filter.MarkedPostDataMap = postDataMap req.Filter.PostDataId = postDataMapID @@ -224,13 +224,13 @@ func (s *SmartFilter) postMark(req *model.Request) { req.Filter.PathId = pathID // 最后计算标记后的唯一请求ID - req.Filter.UniqueId = s.getMarkedUniqueID(req) + req.Filter.UniqueId = getMarkedUniqueID(req) } /** 标记参数名 */ -func (s *SmartFilter) markParamName(paramMap map[string]interface{}) map[string]interface{} { +func markParamName(paramMap map[string]interface{}) map[string]interface{} { markedParamMap := map[string]interface{}{} for key, value := range paramMap { // 纯字母不处理 @@ -300,7 +300,7 @@ func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model. 
} else if onlyAlphaNumRegex.MatchString(valueStr) && numberRegex.MatchString(valueStr) { markedParamMap[key] = MixAlphaNumMark // 含有一些特殊符号 - } else if s.hasSpecialSymbol(valueStr) { + } else if hasSpecialSymbol(valueStr) { markedParamMap[key] = MixSymbolMark // 数字出现的次数超过3,视为数值型参数 } else if b := OneNumberRegex.ReplaceAllString(valueStr, "0"); strings.Count(b, "0") >= 3 { @@ -339,7 +339,7 @@ func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model. /** 标记路径 */ -func (s *SmartFilter) MarkPath(path string) string { +func MarkPath(path string) string { pathParts := strings.Split(path, "/") for index, part := range pathParts { if len(part) >= 32 { @@ -356,7 +356,7 @@ func (s *SmartFilter) MarkPath(path string) string { pathParts[index] = NumberMark } // 含有特殊符号 - } else if s.hasSpecialSymbol(part) { + } else if hasSpecialSymbol(part) { pathParts[index] = MixSymbolMark } else if chineseRegex.MatchString(part) { pathParts[index] = ChineseMark @@ -464,7 +464,7 @@ func (s *SmartFilter) repeatCountStatistic(req *model.Request) { } // 相对于上一级目录,本级path目录的数量统计,存在文件后缀的情况下,放行常见脚本后缀 - if req.URL.ParentPath() == "" || s.inCommonScriptSuffix(req.URL.FileExt()) { + if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) { return } @@ -538,7 +538,7 @@ func (s *SmartFilter) overCountMark(req *model.Request) { } // 处理本级path的伪静态 - if req.URL.ParentPath() == "" || s.inCommonScriptSuffix(req.URL.FileExt()) { + if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) { return } parentPathId := tools.StrMd5(req.URL.ParentPath()) @@ -554,10 +554,27 @@ func (s *SmartFilter) overCountMark(req *model.Request) { } } +// calcFragmentID 计算 fragment 唯一值,如果 fragment 的格式为 url path +func (s *SmartFilter) calcFragmentID(fragment string) string { + if fragment == "" || !strings.HasPrefix(fragment, "/") { + return "" + } + fakeUrl, err := model.GetUrl(fragment) + if err != nil { + logger.Logger.Error("cannot calculate url fragment: ", err) + return "" + } + // XXX: discuss https://github.com/Qianlitp/crawlergo/issues/100 + fakeReq := model.GetRequest(config.GET, fakeUrl) + s.getMark(&fakeReq) + // s.repeatCountStatistic(&fakeReq) + return fakeReq.Filter.UniqueId +} + /** 计算标记后的唯一请求ID */ -func (s *SmartFilter) getMarkedUniqueID(req *model.Request) string { +func getMarkedUniqueID(req *model.Request) string { var paramId string if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS { paramId = req.Filter.QueryMapId @@ -565,7 +582,7 @@ func (s *SmartFilter) getMarkedUniqueID(req *model.Request) string { paramId = req.Filter.PostDataId } - uniqueStr := req.Method + paramId + req.Filter.PathId + req.URL.Host + uniqueStr := req.Method + paramId + req.Filter.PathId + req.URL.Host + req.Filter.FragmentID if req.RedirectionFlag { uniqueStr += "Redirection" } @@ -573,16 +590,13 @@ func (s *SmartFilter) getMarkedUniqueID(req *model.Request) string { uniqueStr += "https" } - if req.URL.Fragment != "" && strings.HasPrefix(req.URL.Fragment, "/") { - uniqueStr += req.URL.Fragment - } return tools.StrMd5(uniqueStr) } /** 计算请求参数的key标记后的唯一ID */ -func (s *SmartFilter) getKeysID(dataMap map[string]interface{}) string { +func getKeysID(dataMap map[string]interface{}) string { var keys []string var idStr string for key := range dataMap { @@ -598,7 +612,7 @@ func (s *SmartFilter) getKeysID(dataMap map[string]interface{}) string { /** 计算请求参数标记后的唯一ID */ -func (s *SmartFilter) getParamMapID(dataMap map[string]interface{}) string { +func 
getParamMapID(dataMap map[string]interface{}) string { var keys []string var idStr string var markReplaceRegex = regexp.MustCompile(`{{.+}}`) @@ -619,14 +633,14 @@ func (s *SmartFilter) getParamMapID(dataMap map[string]interface{}) string { /** 计算PATH标记后的唯一ID */ -func (s *SmartFilter) getPathID(path string) string { +func getPathID(path string) string { return tools.StrMd5(path) } /** 判断字符串中是否存在以下特殊符号 */ -func (s *SmartFilter) hasSpecialSymbol(str string) bool { +func hasSpecialSymbol(str string) bool { symbolList := []string{"{", "}", " ", "|", "#", "@", "$", "*", ",", "<", ">", "/", "?", "\\", "+", "="} for _, sym := range symbolList { if strings.Contains(str, sym) { @@ -636,7 +650,7 @@ func (s *SmartFilter) hasSpecialSymbol(str string) bool { return false } -func (s *SmartFilter) inCommonScriptSuffix(suffix string) bool { +func inCommonScriptSuffix(suffix string) bool { for _, value := range config.ScriptSuffix { if value == suffix { return true diff --git a/pkg/filter/smart_filter_test.go b/pkg/filter/smart_filter_test.go new file mode 100644 index 0000000..2ce1fb8 --- /dev/null +++ b/pkg/filter/smart_filter_test.go @@ -0,0 +1,56 @@ +package filter + +import ( + "testing" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/model" + + "github.com/stretchr/testify/assert" +) + +var ( + // queryUrls = []string{ + // "http://test.nil.local.com/cctv/abcd?keyword=crawlergocrawlergo&end=1", + // "http://test.nil.local.com/cctv/abcd?keyword=crawlergocrawlergo&end=1", + // } + + fragmentUrls = []string{ + // 基准组 + "http://testhtml5.vuwm.com/latest#/page/1", + "http://testhtml5.vuwm.com/latest#/page/search?keyword=Crawlergo&source=2&demo=1423&c=afa", + // 被标记成 {{long}} + "http://testhtml5.vuwm.com/latest#/page/search/fasdfsdafsdfsdfsdfasfsfasfafdsafssfasdfsd", + + // 对照组 + "http://testhtml5.vuwm.com/latest#/page/2", + // 不应该被标记成 {{long}} + "http://testhtml5.vuwm.com/latest#/page/search?keyword=CrawlergoCrawlergoCrawlergo&source=1&demo=1255&c=afa", + } + + // completeUrls = []string{ + // "https://test.local.com:1234/adfatd/123456/sx14xi?user=crawlergo&pwd=fa1424&end=1#/user/info", + // } + smart = SmartFilter{} +) + +func TestDoFilter_countFragment(t *testing.T) { + smart.Init() + reqs := []model.Request{} + for _, fu := range fragmentUrls { + url, err := model.GetUrl(fu) + assert.Nil(t, err) + reqs = append(reqs, model.GetRequest(config.GET, url)) + } + // #/page/1 和 #/page/2 是同一种类型 + assert.Equal(t, smart.calcFragmentID(reqs[0].URL.Fragment), smart.calcFragmentID(reqs[3].URL.Fragment)) + assert.Equal(t, smart.calcFragmentID(reqs[1].URL.Fragment), smart.calcFragmentID(reqs[4].URL.Fragment)) + for _, rq := range reqs[:2] { + // 第一次出现都不应该过滤 + assert.Equal(t, smart.DoFilter(&rq), false) + } + for _, rq := range reqs[3:] { + // 同类型出现第二次,应该被过滤 + assert.Equal(t, smart.DoFilter(&rq), true) + } +} diff --git a/pkg/model/request.go b/pkg/model/request.go index 22d90d1..1c5688a 100755 --- a/pkg/model/request.go +++ b/pkg/model/request.go @@ -1,13 +1,14 @@ package model import ( - "crawlergo/pkg/config" - "crawlergo/pkg/tools" "encoding/json" "errors" "fmt" "net/url" "strings" + + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/tools" ) type Filter struct { @@ -17,6 +18,7 @@ type Filter struct { MarkedPostDataMap map[string]interface{} PostDataId string MarkedPath string + FragmentID string PathId string UniqueId string } diff --git a/pkg/model/url.go b/pkg/model/url.go index af1764a..e38839f 100755 --- a/pkg/model/url.go +++ 
b/pkg/model/url.go @@ -9,7 +9,7 @@ import ( "golang.org/x/net/publicsuffix" - "crawlergo/pkg/tools/requests" + "github.com/Qianlitp/crawlergo/pkg/tools/requests" ) type URL struct { diff --git a/pkg/model/url_test.go b/pkg/model/url_test.go index 8c775a4..ca64edc 100644 --- a/pkg/model/url_test.go +++ b/pkg/model/url_test.go @@ -4,6 +4,7 @@ import ( "net/url" "testing" + "github.com/stretchr/testify/assert" "golang.org/x/net/publicsuffix" ) @@ -43,3 +44,15 @@ func TestRootDomain(t *testing.T) { } } } + +func TestGetUrl(t *testing.T) { + testPath := "/user/info" + testQueryPath := "/user/info?keyword=crawlergocrawlergo&end=1" + url, err := GetUrl(testPath) + assert.Nil(t, err) + assert.NotNil(t, url) + queryUrl, err := GetUrl(testQueryPath) + assert.Nil(t, err) + assert.Equal(t, queryUrl.Path, testPath) + assert.Equal(t, queryUrl.RawQuery, "keyword=crawlergocrawlergo&end=1") +} diff --git a/pkg/path_expansion.go b/pkg/path_expansion.go index 6b7fd9d..5aecd62 100755 --- a/pkg/path_expansion.go +++ b/pkg/path_expansion.go @@ -1,16 +1,17 @@ package pkg import ( - "crawlergo/pkg/config" - "crawlergo/pkg/logger" - model2 "crawlergo/pkg/model" - "crawlergo/pkg/tools" - "crawlergo/pkg/tools/requests" "fmt" "regexp" "strings" "sync" + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/Qianlitp/crawlergo/pkg/logger" + model2 "github.com/Qianlitp/crawlergo/pkg/model" + "github.com/Qianlitp/crawlergo/pkg/tools" + "github.com/Qianlitp/crawlergo/pkg/tools/requests" + mapset "github.com/deckarep/golang-set" "github.com/panjf2000/ants/v2" ) diff --git a/pkg/task_main.go b/pkg/task_main.go index cc57ba5..78e5815 100755 --- a/pkg/task_main.go +++ b/pkg/task_main.go @@ -1,14 +1,14 @@ package pkg import ( - "crawlergo/pkg/config" - engine2 "crawlergo/pkg/engine" - filter2 "crawlergo/pkg/filter" - "crawlergo/pkg/logger" - "crawlergo/pkg/model" "encoding/json" "sync" - "time" + + "github.com/Qianlitp/crawlergo/pkg/config" + engine2 "github.com/Qianlitp/crawlergo/pkg/engine" + filter2 "github.com/Qianlitp/crawlergo/pkg/filter" + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/Qianlitp/crawlergo/pkg/model" "github.com/panjf2000/ants/v2" ) @@ -34,37 +34,10 @@ type Result struct { resultLock sync.Mutex // 合并结果时加锁 } -type TaskConfig struct { - MaxCrawlCount int // 最大爬取的数量 - FilterMode string // simple、smart、strict - ExtraHeaders map[string]interface{} - ExtraHeadersString string - AllDomainReturn bool // 全部域名收集 - SubDomainReturn bool // 子域名收集 - IncognitoContext bool // 开启隐身模式 - NoHeadless bool // headless模式 - DomContentLoadedTimeout time.Duration - TabRunTimeout time.Duration // 单个标签页超时 - PathByFuzz bool // 通过字典进行Path Fuzz - FuzzDictPath string //Fuzz目录字典 - PathFromRobots bool // 解析Robots文件找出路径 - MaxTabsCount int // 允许开启的最大标签页数量 即同时爬取的数量 - ChromiumPath string // Chromium的程序路径 `/home/zhusiyu1/chrome-linux/chrome` - EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序 - EventTriggerInterval time.Duration // 事件触发的间隔 - BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获 - EncodeURLWithCharset bool // 使用检测到的字符集自动编码URL - IgnoreKeywords []string // 忽略的关键字,匹配上之后将不再扫描且不发送请求 - Proxy string // 请求代理 - CustomFormValues map[string]string // 自定义表单填充参数 - CustomFormKeywordValues map[string]string // 自定义表单关键词填充内容 -} - type tabTask struct { crawlerTask *CrawlerTask browser *engine2.Browser req *model.Request - pool *ants.Pool } /** @@ -99,40 +72,19 @@ func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask req.Source = config.FromTarget } - if taskConf.TabRunTimeout == 0 { - 
taskConf.TabRunTimeout = config.TabRunTimeout - } - - if taskConf.MaxTabsCount == 0 { - taskConf.MaxTabsCount = config.MaxTabsCount - } - - if taskConf.FilterMode == config.StrictFilterMode { - crawlerTask.smartFilter.StrictMode = true - } - - if taskConf.MaxCrawlCount == 0 { - taskConf.MaxCrawlCount = config.MaxCrawlCount - } - - if taskConf.DomContentLoadedTimeout == 0 { - taskConf.DomContentLoadedTimeout = config.DomContentLoadedTimeout - } - - if taskConf.EventTriggerInterval == 0 { - taskConf.EventTriggerInterval = config.EventTriggerInterval - } - - if taskConf.BeforeExitDelay == 0 { - taskConf.BeforeExitDelay = config.BeforeExitDelay - } - - if taskConf.EventTriggerMode == "" { - taskConf.EventTriggerMode = config.DefaultEventTriggerMode - } - - if len(taskConf.IgnoreKeywords) == 0 { - taskConf.IgnoreKeywords = config.DefaultIgnoreKeywords + // 业务代码与数据代码分离, 初始化一些默认配置 + // 使用 function option 和一个代理来初始化 taskConf 的配置 + for _, fn := range []TaskConfigOptFunc{ + WithTabRunTimeout(config.TabRunTimeout), + WithMaxTabsCount(config.MaxTabsCount), + WithMaxCrawlCount(config.MaxCrawlCount), + WithDomContentLoadedTimeout(config.DomContentLoadedTimeout), + WithEventTriggerInterval(config.EventTriggerInterval), + WithBeforeExitDelay(config.BeforeExitDelay), + WithEventTriggerMode(config.DefaultEventTriggerMode), + WithIgnoreKeywords(config.DefaultIgnoreKeywords), + } { + fn(&taskConf) } if taskConf.ExtraHeadersString != "" { @@ -215,9 +167,7 @@ func (t *CrawlerTask) Run() { // 对全部请求进行唯一去重 todoFilterAll := make([]*model.Request, len(t.Result.AllReqList)) - for index := range t.Result.AllReqList { - todoFilterAll[index] = t.Result.AllReqList[index] - } + copy(todoFilterAll, t.Result.AllReqList) t.Result.AllReqList = []*model.Request{} var simpleFilter filter2.SimpleFilter diff --git a/pkg/taskconfig.go b/pkg/taskconfig.go new file mode 100644 index 0000000..9391c65 --- /dev/null +++ b/pkg/taskconfig.go @@ -0,0 +1,207 @@ +package pkg + +import "time" + +type TaskConfig struct { + MaxCrawlCount int // 最大爬取的数量 + FilterMode string // simple、smart、strict + ExtraHeaders map[string]interface{} + ExtraHeadersString string + AllDomainReturn bool // 全部域名收集 + SubDomainReturn bool // 子域名收集 + IncognitoContext bool // 开启隐身模式 + NoHeadless bool // headless模式 + DomContentLoadedTimeout time.Duration + TabRunTimeout time.Duration // 单个标签页超时 + PathByFuzz bool // 通过字典进行Path Fuzz + FuzzDictPath string //Fuzz目录字典 + PathFromRobots bool // 解析Robots文件找出路径 + MaxTabsCount int // 允许开启的最大标签页数量 即同时爬取的数量 + ChromiumPath string // Chromium的程序路径 `/home/zhusiyu1/chrome-linux/chrome` + EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序 + EventTriggerInterval time.Duration // 事件触发的间隔 + BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获 + EncodeURLWithCharset bool // 使用检测到的字符集自动编码URL + IgnoreKeywords []string // 忽略的关键字,匹配上之后将不再扫描且不发送请求 + Proxy string // 请求代理 + CustomFormValues map[string]string // 自定义表单填充参数 + CustomFormKeywordValues map[string]string // 自定义表单关键词填充内容 +} + +type TaskConfigOptFunc func(*TaskConfig) + +func NewTaskConfig(optFuncs ...TaskConfigOptFunc) *TaskConfig { + conf := &TaskConfig{} + for _, fn := range optFuncs { + fn(conf) + } + return conf +} + +func WithMaxCrawlCount(maxCrawlCount int) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.MaxCrawlCount == 0 { + tc.MaxCrawlCount = maxCrawlCount + } + } +} + +func WithFilterMode(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.FilterMode == "" { + tc.FilterMode = gen + } + } +} + +func WithExtraHeaders(gen map[string]interface{}) 
TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.ExtraHeaders == nil { + tc.ExtraHeaders = gen + } + } +} + +func WithExtraHeadersString(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.ExtraHeadersString == "" { + tc.ExtraHeadersString = gen + } + } +} + +func WithAllDomainReturn(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.AllDomainReturn { + tc.AllDomainReturn = gen + } + } +} +func WithSubDomainReturn(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.SubDomainReturn { + tc.SubDomainReturn = gen + } + } +} +func WithIncognitoContext(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.IncognitoContext { + tc.IncognitoContext = gen + } + } +} +func WithNoHeadless(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.NoHeadless { + tc.NoHeadless = gen + } + } +} + +func WithDomContentLoadedTimeout(gen time.Duration) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.DomContentLoadedTimeout == 0 { + tc.DomContentLoadedTimeout = gen + } + } +} + +func WithTabRunTimeout(gen time.Duration) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.TabRunTimeout == 0 { + tc.TabRunTimeout = gen + } + } +} +func WithPathByFuzz(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.PathByFuzz { + tc.PathByFuzz = gen + } + } +} +func WithFuzzDictPath(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.FuzzDictPath == "" { + tc.FuzzDictPath = gen + } + } +} +func WithPathFromRobots(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.PathFromRobots { + tc.PathFromRobots = gen + } + } +} +func WithMaxTabsCount(gen int) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.MaxTabsCount == 0 { + tc.MaxTabsCount = gen + } + } +} +func WithChromiumPath(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.ChromiumPath == "" { + tc.ChromiumPath = gen + } + } +} +func WithEventTriggerMode(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.EventTriggerMode == "" { + tc.EventTriggerMode = gen + } + } +} +func WithEventTriggerInterval(gen time.Duration) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.EventTriggerInterval == 0 { + tc.EventTriggerInterval = gen + } + } +} +func WithBeforeExitDelay(gen time.Duration) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.BeforeExitDelay == 0 { + tc.BeforeExitDelay = gen + } + } +} +func WithEncodeURLWithCharset(gen bool) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if !tc.EncodeURLWithCharset { + tc.EncodeURLWithCharset = gen + } + } +} +func WithIgnoreKeywords(gen []string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.IgnoreKeywords == nil || len(tc.IgnoreKeywords) == 0 { + tc.IgnoreKeywords = gen + } + } +} +func WithProxy(gen string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.Proxy == "" { + tc.Proxy = gen + } + } +} +func WithCustomFormValues(gen map[string]string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.CustomFormValues == nil || len(tc.CustomFormValues) == 0 { + tc.CustomFormValues = gen + } + } +} +func WithCustomFormKeywordValues(gen map[string]string) TaskConfigOptFunc { + return func(tc *TaskConfig) { + if tc.CustomFormKeywordValues == nil || len(tc.CustomFormKeywordValues) == 0 { + tc.CustomFormKeywordValues = gen + } + } +} diff --git a/pkg/taskconfig_test.go b/pkg/taskconfig_test.go new file mode 100644 index 0000000..7d96fc4 --- /dev/null +++ 
b/pkg/taskconfig_test.go @@ -0,0 +1,45 @@ +package pkg_test + +import ( + "testing" + "time" + + "github.com/Qianlitp/crawlergo/pkg" + "github.com/Qianlitp/crawlergo/pkg/config" + "github.com/stretchr/testify/assert" +) + +func TestTaskConfigOptFunc(t *testing.T) { + // 测试 https://github.com/Qianlitp/crawlergo/pull/101 修改的代码 + var taskConf pkg.TaskConfig + for _, fn := range []pkg.TaskConfigOptFunc{ + pkg.WithTabRunTimeout(config.TabRunTimeout), + pkg.WithMaxTabsCount(config.MaxTabsCount), + pkg.WithMaxCrawlCount(config.MaxCrawlCount), + pkg.WithDomContentLoadedTimeout(config.DomContentLoadedTimeout), + pkg.WithEventTriggerInterval(config.EventTriggerInterval), + pkg.WithBeforeExitDelay(config.BeforeExitDelay), + pkg.WithEventTriggerMode(config.DefaultEventTriggerMode), + pkg.WithIgnoreKeywords(config.DefaultIgnoreKeywords), + } { + fn(&taskConf) + } + + // 应该都要等于默认配置 + assert.Equal(t, taskConf.TabRunTimeout, config.TabRunTimeout) + assert.Equal(t, taskConf.MaxTabsCount, config.MaxTabsCount) + assert.Equal(t, taskConf.MaxCrawlCount, config.MaxCrawlCount) + assert.Equal(t, taskConf.DomContentLoadedTimeout, config.DomContentLoadedTimeout) + assert.Equal(t, taskConf.EventTriggerInterval, config.EventTriggerInterval) + assert.Equal(t, taskConf.BeforeExitDelay, config.BeforeExitDelay) + assert.Equal(t, taskConf.EventTriggerMode, config.DefaultEventTriggerMode) + assert.Equal(t, taskConf.IgnoreKeywords, config.DefaultIgnoreKeywords) + + // 重设超时时间 + taskConf.TabRunTimeout = time.Minute * 5 + + // 企图覆盖自定义的时间, 不应该允许, 程序初始化时只能配置一次, 先由用户配置 + pkg.WithTabRunTimeout(time.Second * 5)(&taskConf) + assert.NotEqual(t, taskConf.TabRunTimeout, time.Second*5) + assert.NotEqual(t, taskConf.TabRunTimeout, config.TabRunTimeout) +} diff --git a/pkg/tools/common.go b/pkg/tools/common.go index 970cc09..9bb6471 100644 --- a/pkg/tools/common.go +++ b/pkg/tools/common.go @@ -2,13 +2,14 @@ package tools import ( "bufio" - "crawlergo/pkg/logger" "crypto/md5" "encoding/hex" "fmt" "io" "os" "strings" + + "github.com/Qianlitp/crawlergo/pkg/logger" ) func StrMd5(str string) string { diff --git a/pkg/tools/requests/requests.go b/pkg/tools/requests/requests.go index 9605111..7dc25e9 100755 --- a/pkg/tools/requests/requests.go +++ b/pkg/tools/requests/requests.go @@ -2,14 +2,15 @@ package requests import ( "bytes" - "crawlergo/pkg/logger" "crypto/tls" "fmt" - "github.com/pkg/errors" "net/http" "net/url" "strings" "time" + + "github.com/Qianlitp/crawlergo/pkg/logger" + "github.com/pkg/errors" ) const DefaultUa = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" + diff --git a/pkg/tools/requests/response.go b/pkg/tools/requests/response.go index c9d1121..f2b4f63 100644 --- a/pkg/tools/requests/response.go +++ b/pkg/tools/requests/response.go @@ -1,9 +1,10 @@ package requests import ( - "crawlergo/pkg/logger" "io/ioutil" "net/http" + + "github.com/Qianlitp/crawlergo/pkg/logger" ) // 自定义一些函数
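The Makefile above injects the build version at link time: `-X 'main.Version=$(VERSION)'` writes into the `Version` variable that main.go now declares and hands to `cli.App`. A minimal standalone sketch of that mechanism (illustrative code, not from this repository):

```go
package main

import "fmt"

// Version is populated at build time, e.g.:
//   go build -ldflags "-X main.Version=$(git describe --tags --always)"
// A plain `go build` leaves it as the empty string.
var Version string

func main() {
	fmt.Println("crawlergo", Version)
}
```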
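flag.go moves every CLI option into its own constructor that returns a `cli.Flag` implementation, and `cliFlags` simply lists the constructors. A sketch of how a new option could follow the same convention; the `dump-har` flag, the `harPath` variable, and `SetDumpHAR` are hypothetical names invented only to illustrate the pattern:

```go
package main

import "github.com/urfave/cli/v2"

// harPath is a hypothetical destination variable, mirroring how
// outputJsonPath and pushAddress back the existing flags.
var harPath string

// SetDumpHAR follows the Set* convention used in flag.go: one
// constructor per flag. Registering it would be one more entry
// appended to the cliFlags slice.
func SetDumpHAR() *cli.StringFlag {
	return &cli.StringFlag{
		Name:        "dump-har",
		Usage:       "write browser traffic to a HAR `File`.",
		Destination: &harPath,
	}
}
```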
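The changes to run and handleExit widen `signal.Notify` from `os.Interrupt` alone to SIGTERM, SIGQUIT, and SIGINT, and replace the single-case `select` with a plain channel receive. A standalone sketch of the resulting shutdown pattern, with a comment standing in for the pool and browser teardown that handleExit performs:

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	signalChan := make(chan os.Signal, 1)
	// Same signal set as run(): Ctrl-C (SIGINT), kill's default
	// SIGTERM, and SIGQUIT.
	signal.Notify(signalChan, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGINT)

	<-signalChan // a plain receive; the one-case select was redundant
	fmt.Println("exit ...")
	// release the tab pool and close the browser here, as handleExit does
	os.Exit(-1)
}
```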
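smart_filter.go now folds path-style URL fragments into deduplication: `calcFragmentID` parses a fragment such as `#/page/1` as a pseudo-URL, marks it via `getMark`, and the resulting `FragmentID` is mixed into the request's unique ID. A sketch of the observable effect, using only the exported APIs that smart_filter_test.go exercises (the URLs mirror its fixtures):

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg/config"
	"github.com/Qianlitp/crawlergo/pkg/filter"
	"github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
	smart := filter.SmartFilter{}
	smart.Init()

	urls := []string{
		"http://testhtml5.vuwm.com/latest#/page/1",
		"http://testhtml5.vuwm.com/latest#/page/2", // same shape, only the number differs
	}
	for _, u := range urls {
		parsed, err := model.GetUrl(u)
		if err != nil {
			panic(err)
		}
		req := model.GetRequest(config.GET, parsed)
		// The first URL passes (false); the second is filtered (true)
		// because both fragments normalize to the same FragmentID.
		fmt.Println(u, "filtered:", smart.DoFilter(&req))
	}
}
```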
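taskconfig.go expresses defaults through functional options: every `WithXxx` option writes its value only while the field is still at its zero value, which is why NewCrawlerTask can layer the `config.*` defaults underneath whatever the caller set first (taskconfig_test.go asserts exactly this precedence). A minimal usage sketch, assuming the module path from this diff:

```go
package main

import (
	"time"

	"github.com/Qianlitp/crawlergo/pkg"
)

func main() {
	// Caller-supplied values first: each WithXxx only fills fields
	// that are still zero, so these settings survive when
	// NewCrawlerTask later applies the package defaults.
	conf := pkg.NewTaskConfig(
		pkg.WithMaxTabsCount(10),
		pkg.WithTabRunTimeout(5*time.Minute),
		pkg.WithFilterMode("smart"),
	)
	_ = conf // passed by value, e.g. pkg.NewCrawlerTask(targets, *conf)
}
```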