Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce passive crawling #781

Merged
merged 18 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.StringVarP(&options.ChromeWSUrl, "chrome-ws-url", "cwu", "", "use chrome browser instance launched elsewhere with the debugger listening at this URL"),
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"),
)
flagSet.CreateGroup("passive", "Passive",
flagSet.BoolVarP(&options.Passive, "passive", "ps", false, "enable passive sources to discover target endpoints"),
flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (wayback,urlscan,commoncrawl,virustotal,alienvault)", goflags.NormalizedStringSliceOptions),
)

flagSet.CreateGroup("scope", "Scope",
flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions),
Expand Down
8 changes: 6 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ require (
github.com/pkg/errors v0.9.1
github.com/projectdiscovery/dsl v0.0.45
github.com/projectdiscovery/fastdialer v0.0.59
github.com/projectdiscovery/goflags v0.1.39
github.com/projectdiscovery/goflags v0.1.40
github.com/projectdiscovery/gologger v1.1.12
github.com/projectdiscovery/hmap v0.0.40
github.com/projectdiscovery/mapcidr v1.1.16
github.com/projectdiscovery/ratelimit v0.0.30
github.com/projectdiscovery/retryablehttp-go v1.0.49
github.com/projectdiscovery/useragent v0.0.39
github.com/projectdiscovery/utils v0.0.79
github.com/projectdiscovery/wappalyzergo v0.0.109
github.com/remeh/sizedwaitgroup v1.0.0
Expand Down Expand Up @@ -55,6 +56,7 @@ require (
github.com/kataras/jwt v0.1.8 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
Expand All @@ -68,9 +70,11 @@ require (
github.com/projectdiscovery/asnmap v1.0.6 // indirect
github.com/projectdiscovery/blackrock v0.0.1 // indirect
github.com/projectdiscovery/gostruct v0.0.2 // indirect
github.com/projectdiscovery/stringsutil v0.0.2 // indirect
github.com/quic-go/quic-go v0.37.7 // indirect
github.com/refraction-networking/utls v1.5.4 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
github.com/sashabaranov/go-openai v1.14.2 // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 // indirect
Expand Down Expand Up @@ -132,7 +136,7 @@ require (
github.com/zmap/zcrypto v0.0.0-20230422215203-9a665e1e9968 // indirect
go.etcd.io/bbolt v1.3.7 // indirect
golang.org/x/crypto v0.17.0 // indirect
golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df // indirect
golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df
golang.org/x/mod v0.12.0 // indirect
golang.org/x/sys v0.16.0 // indirect
golang.org/x/text v0.14.0 // indirect
Expand Down
17 changes: 14 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vc
github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08 h1:ox2F0PSMlrAAiAdknSRMDrAr8mfxPCfSZolH+/qQnyQ=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08/go.mod h1:pCxVEbcm3AMg7ejXyorUXi6HQCzOIBf7zEDVPtw0/U4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -124,8 +125,9 @@ github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo
github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE=
github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand Down Expand Up @@ -190,6 +192,7 @@ github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+q
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM=
github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand All @@ -204,8 +207,8 @@ github.com/projectdiscovery/dsl v0.0.45 h1:fDMmBkpk5L7NhvihBxYoIch7UVBqLyKI6JzqK
github.com/projectdiscovery/dsl v0.0.45/go.mod h1:G9mK7rQT5FkmNcMlCCfYxQrFcmSucSGWb2QgJZC0i0A=
github.com/projectdiscovery/fastdialer v0.0.59 h1:5D0ws7JsYYMC8lm2VKgf91q0kAk6dzqoSHmLIuA2mww=
github.com/projectdiscovery/fastdialer v0.0.59/go.mod h1:VdYQimFCHafSPP3c+OXgu1DP+FNLJq+U6Xk/ybfJ0/A=
github.com/projectdiscovery/goflags v0.1.39 h1:Dj8UY+FJEfxzT20+kyDkeTlGs+Ys19SiXWek7x1tPTs=
github.com/projectdiscovery/goflags v0.1.39/go.mod h1:ouB+HpJvhKZJjT8Bd13RYD6au7sQ7nGylbfad3JFy+E=
github.com/projectdiscovery/goflags v0.1.40 h1:yQCB0m0YxkonbkF7hRlRSGAeWEUJdJh5MLqW2gQFdps=
github.com/projectdiscovery/goflags v0.1.40/go.mod h1:lNrjzQPzSt06waiLmZuBfcusNdLHVvueCgGcSovE6d4=
github.com/projectdiscovery/gologger v1.1.12 h1:uX/QkQdip4PubJjjG0+uk5DtyAi1ANPJUvpmimXqv4A=
github.com/projectdiscovery/gologger v1.1.12/go.mod h1:DI8nywPLERS5mo8QEA9E7gd5HZ3Je14SjJBH3F5/kLw=
github.com/projectdiscovery/gostruct v0.0.2 h1:s8gP8ApugGM4go1pA+sVlPDXaWqNP5BBDDSv7VEdG1M=
Expand All @@ -222,6 +225,10 @@ github.com/projectdiscovery/retryabledns v1.0.57 h1:+DOL9xYSIx74FRrOIKKHVp5R9ci5
github.com/projectdiscovery/retryabledns v1.0.57/go.mod h1:qIigOcmO9d0Ce/z6mHzLl0Aiz2WJcNk2gUGhRcCQ1k4=
github.com/projectdiscovery/retryablehttp-go v1.0.49 h1:mvlvl2kTN+ctpDIRlusVWui7eyFlElBoKTr8crS7yvY=
github.com/projectdiscovery/retryablehttp-go v1.0.49/go.mod h1:VaJ7Au+1LP8C2u0qmx4NN1IdAxxkhoXpIcc9LAQzFo4=
github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA=
github.com/projectdiscovery/stringsutil v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0=
github.com/projectdiscovery/useragent v0.0.39 h1:s2jyXdtjVo0MfYYkifx7irrOIoA0JhzhZaBkpcoWgV4=
github.com/projectdiscovery/useragent v0.0.39/go.mod h1:wO6GQImJ2IQ5K+GDggS/Rhg6IV9Z2Du6NbqC/um0g0w=
github.com/projectdiscovery/utils v0.0.79 h1:ptO3Qo2e24SK5w5yvDk2whsvSEIk7gSX+RNhBQPRKqc=
github.com/projectdiscovery/utils v0.0.79/go.mod h1:tBFlI+1warN7y7hKpFf6pqqOszvufENofy9Md0qlZQo=
github.com/projectdiscovery/wappalyzergo v0.0.109 h1:BERfwTRn1dvB1tbhyc5m67R8VkC9zbVuPsEq4VEm07k=
Expand All @@ -236,8 +243,12 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sashabaranov/go-openai v1.14.2 h1:5DPTtR9JBjKPJS008/A409I5ntFhUPPGCmaAihcPRyo=
Expand Down
3 changes: 3 additions & 0 deletions internal/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/projectdiscovery/katana/pkg/engine"
"github.com/projectdiscovery/katana/pkg/engine/hybrid"
"github.com/projectdiscovery/katana/pkg/engine/parser"
"github.com/projectdiscovery/katana/pkg/engine/passive"
"github.com/projectdiscovery/katana/pkg/engine/standard"
"github.com/projectdiscovery/katana/pkg/types"
"github.com/projectdiscovery/mapcidr"
Expand Down Expand Up @@ -98,6 +99,8 @@ func New(options *types.Options) (*Runner, error) {
switch {
case options.Headless:
crawler, err = hybrid.New(crawlerOptions)
case options.Passive:
crawler, err = passive.New(crawlerOptions)
default:
crawler, err = standard.New(crawlerOptions)
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/engine/passive/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package passive implements the functionality for a non-headless crawler.
// It uses net/http for making requests and goquery for scraping web page HTML.
package passive
6 changes: 6 additions & 0 deletions pkg/engine/passive/extractor/extractor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package extractor

// UrlExtractor is an interface that defines the contract for domain extraction.
type UrlExtractor interface {
Extract(text string) []string
}
29 changes: 29 additions & 0 deletions pkg/engine/passive/extractor/regex_extractor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package extractor

import (
"regexp"
"strings"
)

// RegexUrlExtractor is a concrete implementation of the UrlExtractor interface, using regex for extraction.
type RegexUrlExtractor struct {
extractor *regexp.Regexp
}

// NewRegexUrlExtractor creates a new regular expression to extract urls
func NewRegexUrlExtractor() (*RegexUrlExtractor, error) {
extractor, err := regexp.Compile(`(?:http|https)?://(?:www\.)?[a-zA-Z0-9./?=_%:-]*`)
if err != nil {
return nil, err
}
return &RegexUrlExtractor{extractor: extractor}, nil
}

// Extract implements the UrlExtractor interface, using the regex to find urls in the given text.
func (re *RegexUrlExtractor) Extract(text string) []string {
matches := re.extractor.FindAllString(text, -1)
for i, match := range matches {
matches[i] = strings.ToLower(match)
}
return matches
}
124 changes: 124 additions & 0 deletions pkg/engine/passive/httpclient/httpclient.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package httpclient

import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"net"
"net/http"
"net/url"
"time"

"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/useragent"
)

type HttpClient struct {
Client *http.Client
}

type BasicAuth struct {
Username string
Password string
}

func NewHttpClient(timeout int) *HttpClient {
Transport := &http.Transport{
MaxIdleConns: 100,
MaxIdleConnsPerHost: 100,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
Dismissed Show dismissed Hide dismissed
},
Dial: (&net.Dialer{
Timeout: time.Duration(timeout) * time.Second,
}).Dial,
}

client := &http.Client{
Transport: Transport,
Timeout: time.Duration(timeout) * time.Second,
}

httpClient := &HttpClient{Client: client}

return httpClient
}

func (hc *HttpClient) Get(ctx context.Context, getURL, cookies string, headers map[string]string) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodGet, getURL, cookies, headers, nil, BasicAuth{})
}

func (hc *HttpClient) SimpleGet(ctx context.Context, getURL string) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodGet, getURL, "", map[string]string{}, nil, BasicAuth{})
}

func (hc *HttpClient) Post(ctx context.Context, postURL, cookies string, headers map[string]string, body io.Reader) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodPost, postURL, cookies, headers, body, BasicAuth{})
}

func (hc *HttpClient) SimplePost(ctx context.Context, postURL, contentType string, body io.Reader) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodPost, postURL, "", map[string]string{"Content-Type": contentType}, body, BasicAuth{})
}

func (hc *HttpClient) HTTPRequest(ctx context.Context, method, requestURL, cookies string, headers map[string]string, body io.Reader, basicAuth BasicAuth) (*http.Response, error) {
req, err := http.NewRequestWithContext(ctx, method, requestURL, body)
if err != nil {
return nil, err
}

userAgent := useragent.PickRandom()
req.Header.Set("User-Agent", userAgent.String())
req.Header.Set("Accept", "*/*")
req.Header.Set("Accept-Language", "en")
req.Header.Set("Connection", "close")

if basicAuth.Username != "" || basicAuth.Password != "" {
req.SetBasicAuth(basicAuth.Username, basicAuth.Password)
}

if cookies != "" {
req.Header.Set("Cookie", cookies)
}

for key, value := range headers {
req.Header.Set(key, value)
}

return httpRequestWrapper(hc.Client, req)
}

func (hc *HttpClient) DiscardHTTPResponse(response *http.Response) {
if response != nil {
_, err := io.Copy(io.Discard, response.Body)
if err != nil {
gologger.Warning().Msgf("Could not discard response body: %s\n", err)
return
}
response.Body.Close()
}
}

func (hc *HttpClient) Close() {
hc.Client.CloseIdleConnections()
}

func httpRequestWrapper(client *http.Client, request *http.Request) (*http.Response, error) {
response, err := client.Do(request)
if err != nil {
return nil, err
}

if response.StatusCode != http.StatusOK {
requestURL, _ := url.QueryUnescape(request.URL.String())

gologger.Debug().MsgFunc(func() string {
buffer := new(bytes.Buffer)
_, _ = buffer.ReadFrom(response.Body)
return fmt.Sprintf("Response for failed request against %s:\n%s", requestURL, buffer.String())
})
return response, fmt.Errorf("unexpected status code %d received from %s", response.StatusCode, requestURL)
}
return response, nil
}
Loading
Loading