diff --git a/README.md b/README.md
index d5f7834..4655427 100644
--- a/README.md
+++ b/README.md
@@ -34,26 +34,27 @@ func main() {
 	}
 
 	// Scrape a single URL
-	url := "https://mendable.ai"
-	scrapedData, err := app.ScrapeURL(url, nil)
+	scrapeResult, err := app.ScrapeURL("example.com", nil)
 	if err != nil {
-		log.Fatalf("Error occurred while scraping: %v", err)
+		log.Fatalf("Failed to scrape URL: %v", err)
 	}
-	fmt.Println(scrapedData)
+	fmt.Println(scrapeResult.Markdown)
 
 	// Crawl a website
-	crawlUrl := "https://mendable.ai"
-	params := map[string]any{
-		"pageOptions": map[string]any{
-			"onlyMainContent": true,
-		},
+	idempotencyKey := uuid.New().String() // optional idempotency key
+	crawlParams := &firecrawl.CrawlParams{
+		ExcludePaths: []string{"blog/*"},
+		MaxDepth:     ptr(2),
 	}
-
-	crawlResult, err := app.CrawlURL(crawlUrl, params)
+	crawlResult, err := app.CrawlURL("example.com", crawlParams, &idempotencyKey)
 	if err != nil {
-		log.Fatalf("Error occurred while crawling: %v", err)
+		log.Fatalf("Failed to crawl URL: %v", err)
 	}
-	fmt.Println(crawlResult)
+	jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", "  ")
+	if err != nil {
+		log.Fatalf("Failed to marshal crawl result: %v", err)
+	}
+	fmt.Println(string(jsonCrawlResult))
 }
 ```
 
@@ -62,7 +63,7 @@ func main() {
-To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
+To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped document as a `*FirecrawlDocument`.
 
 ```go
-url := "https://mendable.ai"
+url := "https://example.com"
 scrapedData, err := app.ScrapeURL(url, nil)
 if err != nil {
 	log.Fatalf("Failed to scrape URL: %v", err)
@@ -111,48 +112,41 @@ if err != nil {
 fmt.Println(scrapeResult)
 ```
 
-### Search for a query
-
-To search the web, get the most relevant results, scrap each page and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results.
+### Crawling a Website
+
+To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, included or excluded paths, and the output format.
 
 ```go
-query := "what is mendable?"
-searchResult, err := app.Search(query)
+response, err := app.CrawlURL("https://roastmywebsite.ai", nil, nil)
 if err != nil {
-	log.Fatalf("Failed to search: %v", err)
+	log.Fatalf("Failed to crawl URL: %v", err)
 }
-fmt.Println(searchResult)
+
+fmt.Println(response)
 ```
 
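The crawl examples use a small `ptr` helper to fill pointer-typed optional fields such as `MaxDepth`. The SDK does not export one (the test suite later in this diff defines its own), so callers define it themselves; a minimal sketch, assuming Go 1.18+ generics:

```go
// ptr returns a pointer to v; convenient for optional fields
// such as MaxDepth (*int) or OnlyMainContent (*bool).
func ptr[T any](v T) *T {
	return &v
}

// usage: MaxDepth: ptr(2), OnlyMainContent: ptr(true)
```
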
-### Crawling a Website
+### Asynchronous Crawl
 
-To crawl a website, use the `CrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+To start a crawl without blocking until it finishes, use the `AsyncCrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument lets you configure the crawl, such as the maximum number of pages to crawl, included or excluded paths, and the output format. On success, the method returns an ID, which you then use to check the status of the crawl.
 
 ```go
-crawlParams := map[string]any{
-	"crawlerOptions": map[string]any{
-		"excludes": []string{"blog/*"},
-		"includes": []string{}, // leave empty for all pages
-		"limit":    1000,
-	},
-	"pageOptions": map[string]any{
-		"onlyMainContent": true,
-	},
-}
-crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
+response, err := app.AsyncCrawlURL("https://roastmywebsite.ai", nil, nil)
 if err != nil {
 	log.Fatalf("Failed to crawl URL: %v", err)
 }
-fmt.Println(crawlResult)
+
+fmt.Println(response)
 ```
 
 ### Checking Crawl Status
 
-To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
+To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the crawl ID as a parameter and returns the current status of the crawl job.
 
 ```go
-status, err := app.CheckCrawlStatus(jobId)
+status, err := app.CheckCrawlStatus(id)
 if err != nil {
 	log.Fatalf("Failed to check crawl status: %v", err)
 }
diff --git a/firecrawl.go b/firecrawl.go
index 9a9dcfe..f0847ad 100644
--- a/firecrawl.go
+++ b/firecrawl.go
@@ -14,63 +14,60 @@ import (
 
 // FirecrawlDocumentMetadata represents metadata for a Firecrawl document
 type FirecrawlDocumentMetadata struct {
-	Title             string   `json:"title,omitempty"`
-	Description       string   `json:"description,omitempty"`
-	Language          string   `json:"language,omitempty"`
-	Keywords          string   `json:"keywords,omitempty"`
-	Robots            string   `json:"robots,omitempty"`
-	OGTitle           string   `json:"ogTitle,omitempty"`
-	OGDescription     string   `json:"ogDescription,omitempty"`
-	OGURL             string   `json:"ogUrl,omitempty"`
-	OGImage           string   `json:"ogImage,omitempty"`
-	OGAudio           string   `json:"ogAudio,omitempty"`
-	OGDeterminer      string   `json:"ogDeterminer,omitempty"`
-	OGLocale          string   `json:"ogLocale,omitempty"`
-	OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
-	OGSiteName        string   `json:"ogSiteName,omitempty"`
-	OGVideo           string   `json:"ogVideo,omitempty"`
-	DCTermsCreated    string   `json:"dctermsCreated,omitempty"`
-	DCDateCreated     string   `json:"dcDateCreated,omitempty"`
-	DCDate            string   `json:"dcDate,omitempty"`
-	DCTermsType       string   `json:"dctermsType,omitempty"`
-	DCType            string   `json:"dcType,omitempty"`
-	DCTermsAudience   string   `json:"dctermsAudience,omitempty"`
-	DCTermsSubject    string   `json:"dctermsSubject,omitempty"`
-	DCSubject         string   `json:"dcSubject,omitempty"`
-	DCDescription     string   `json:"dcDescription,omitempty"`
-	DCTermsKeywords   string   `json:"dctermsKeywords,omitempty"`
-	ModifiedTime      string   `json:"modifiedTime,omitempty"`
-	PublishedTime     string   `json:"publishedTime,omitempty"`
-	ArticleTag        string   `json:"articleTag,omitempty"`
-	ArticleSection    string   `json:"articleSection,omitempty"`
-	SourceURL         string   `json:"sourceURL,omitempty"`
-	PageStatusCode    int      `json:"pageStatusCode,omitempty"`
-	PageError         string   `json:"pageError,omitempty"`
+	Title             *string   `json:"title,omitempty"`
+	Description       *string   `json:"description,omitempty"`
+	Language          *string   `json:"language,omitempty"`
+	Keywords          *string   `json:"keywords,omitempty"`
+	Robots            *string   `json:"robots,omitempty"`
+	OGTitle           *string   `json:"ogTitle,omitempty"`
+	OGDescription     *string   `json:"ogDescription,omitempty"`
+	OGURL             *string   `json:"ogUrl,omitempty"`
+	OGImage           *string   `json:"ogImage,omitempty"`
+	OGAudio           *string   `json:"ogAudio,omitempty"`
+	OGDeterminer      *string   `json:"ogDeterminer,omitempty"`
+	OGLocale          *string   `json:"ogLocale,omitempty"`
+	OGLocaleAlternate []*string `json:"ogLocaleAlternate,omitempty"`
+	OGSiteName        *string   `json:"ogSiteName,omitempty"`
+	OGVideo           *string   `json:"ogVideo,omitempty"`
+	DCTermsCreated    *string   `json:"dctermsCreated,omitempty"`
+	DCDateCreated     *string   `json:"dcDateCreated,omitempty"`
+	DCDate            *string   `json:"dcDate,omitempty"`
+	DCTermsType       *string   `json:"dctermsType,omitempty"`
+	DCType            *string   `json:"dcType,omitempty"`
+	DCTermsAudience   *string   `json:"dctermsAudience,omitempty"`
+	DCTermsSubject    *string   `json:"dctermsSubject,omitempty"`
+	DCSubject         *string   `json:"dcSubject,omitempty"`
+	DCDescription     *string   `json:"dcDescription,omitempty"`
+	DCTermsKeywords   *string   `json:"dctermsKeywords,omitempty"`
+	ModifiedTime      *string   `json:"modifiedTime,omitempty"`
+	PublishedTime     *string   `json:"publishedTime,omitempty"`
+	ArticleTag        *string   `json:"articleTag,omitempty"`
+	ArticleSection    *string   `json:"articleSection,omitempty"`
+	SourceURL         *string   `json:"sourceURL,omitempty"`
+	StatusCode        *int      `json:"statusCode,omitempty"`
+	Error             *string   `json:"error,omitempty"`
 }
 
 // FirecrawlDocument represents a document in Firecrawl
 type FirecrawlDocument struct {
-	ID            string                     `json:"id,omitempty"`
-	URL           string                     `json:"url,omitempty"`
-	Content       string                     `json:"content"`
-	Markdown      string                     `json:"markdown,omitempty"`
-	HTML          string                     `json:"html,omitempty"`
-	LLMExtraction map[string]any             `json:"llm_extraction,omitempty"`
-	CreatedAt     *time.Time                 `json:"createdAt,omitempty"`
-	UpdatedAt     *time.Time                 `json:"updatedAt,omitempty"`
-	Type          string                     `json:"type,omitempty"`
-	Metadata      *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
-	ChildrenLinks []string                   `json:"childrenLinks,omitempty"`
-	Provider      string                     `json:"provider,omitempty"`
-	Warning       string                     `json:"warning,omitempty"`
-	Index         int                        `json:"index,omitempty"`
+	Markdown   string                     `json:"markdown,omitempty"`
+	HTML       string                     `json:"html,omitempty"`
+	RawHTML    string                     `json:"rawHtml,omitempty"`
+	Screenshot string                     `json:"screenshot,omitempty"`
+	Links      []string                   `json:"links,omitempty"`
+	Metadata   *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
 }
 
-// ExtractorOptions represents options for extraction.
-type ExtractorOptions struct {
-	Mode             string `json:"mode,omitempty"`
-	ExtractionPrompt string `json:"extractionPrompt,omitempty"`
-	ExtractionSchema any    `json:"extractionSchema,omitempty"`
+// ScrapeParams represents the parameters for a scrape request.
+type ScrapeParams struct {
+	Formats         []string           `json:"formats,omitempty"`
+	Headers         *map[string]string `json:"headers,omitempty"`
+	IncludeTags     []string           `json:"includeTags,omitempty"`
+	ExcludeTags     []string           `json:"excludeTags,omitempty"`
+	OnlyMainContent *bool              `json:"onlyMainContent,omitempty"`
+	WaitFor         *int               `json:"waitFor,omitempty"`
+	ParsePDF        *bool              `json:"parsePDF,omitempty"`
+	Timeout         *int               `json:"timeout,omitempty"`
 }
 
 // ScrapeResponse represents the response for scraping operations
@@ -79,30 +76,35 @@ type ScrapeResponse struct {
 	Data    *FirecrawlDocument `json:"data,omitempty"`
 }
 
-// SearchResponse represents the response for searching operations
-type SearchResponse struct {
-	Success bool                 `json:"success"`
-	Data    []*FirecrawlDocument `json:"data,omitempty"`
+// CrawlParams represents the parameters for a crawl request.
+type CrawlParams struct {
+	ScrapeOptions      ScrapeParams `json:"scrapeOptions"`
+	Webhook            *string      `json:"webhook,omitempty"`
+	Limit              *int         `json:"limit,omitempty"`
+	IncludePaths       []string     `json:"includePaths,omitempty"`
+	ExcludePaths       []string     `json:"excludePaths,omitempty"`
+	MaxDepth           *int         `json:"maxDepth,omitempty"`
+	AllowBackwardLinks *bool        `json:"allowBackwardLinks,omitempty"`
+	AllowExternalLinks *bool        `json:"allowExternalLinks,omitempty"`
+	IgnoreSitemap      *bool        `json:"ignoreSitemap,omitempty"`
 }
 
 // CrawlResponse represents the response for crawling operations
 type CrawlResponse struct {
-	Success bool                 `json:"success"`
-	JobID   string               `json:"jobId,omitempty"`
-	Data    []*FirecrawlDocument `json:"data,omitempty"`
+	Success bool   `json:"success"`
+	ID      string `json:"id,omitempty"`
+	URL     string `json:"url,omitempty"`
 }
 
-// JobStatusResponse represents the response for checking crawl job status
-type JobStatusResponse struct {
-	Success     bool                 `json:"success"`
+// CrawlStatusResponse (formerly JobStatusResponse) represents the response for checking the status of a crawl job
+type CrawlStatusResponse struct {
 	Status      string               `json:"status"`
-	Current     int                  `json:"current,omitempty"`
-	CurrentURL  string               `json:"current_url,omitempty"`
-	CurrentStep string               `json:"current_step,omitempty"`
 	Total       int                  `json:"total,omitempty"`
-	JobID       string               `json:"jobId,omitempty"`
+	Completed   int                  `json:"completed,omitempty"`
+	CreditsUsed int                  `json:"creditsUsed,omitempty"`
+	ExpiresAt   string               `json:"expiresAt,omitempty"`
+	Next        *string              `json:"next,omitempty"`
 	Data        []*FirecrawlDocument `json:"data,omitempty"`
-	PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
 }
 
 // CancelCrawlJobResponse represents the response for canceling a crawl job
@@ -111,6 +113,21 @@ type CancelCrawlJobResponse struct {
 	Status string `json:"status"`
 }
 
+// MapParams represents the parameters for a map request.
+type MapParams struct {
+	IncludeSubdomains *bool   `json:"includeSubdomains,omitempty"`
+	Search            *string `json:"search,omitempty"`
+	IgnoreSitemap     *bool   `json:"ignoreSitemap,omitempty"`
+	Limit             *int    `json:"limit,omitempty"`
+}
+
+// MapResponse represents the response for mapping operations
+type MapResponse struct {
+	Success bool     `json:"success"`
+	Links   []string `json:"links,omitempty"`
+	Error   string   `json:"error,omitempty"`
+}
+
 // requestOptions represents options for making requests.
 type requestOptions struct {
 	retries int
@@ -163,9 +180,10 @@ func withBackoff(backoff int) requestOption {
 
 // FirecrawlApp represents a client for the Firecrawl API.
 type FirecrawlApp struct {
-	APIKey string
-	APIURL string
-	Client *http.Client
+	APIKey  string
+	APIURL  string
+	Client  *http.Client
+	Version string
 }
 
 // NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL.
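One behavioral note on the new types above: large crawls are paginated, and `CrawlStatusResponse.Next`, when non-nil, points at the next page of results. Nothing in this SDK follows it automatically, so a caller would fetch it by hand. A sketch under that assumption (a hypothetical helper, not part of the SDK; needs `encoding/json` and `net/http`):

```go
// fetchNextPage follows a CrawlStatusResponse.Next URL manually,
// reusing the client's exported Client and APIKey fields.
func fetchNextPage(app *firecrawl.FirecrawlApp, next string) (*firecrawl.CrawlStatusResponse, error) {
	req, err := http.NewRequest(http.MethodGet, next, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+app.APIKey)

	resp, err := app.Client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var page firecrawl.CrawlStatusResponse
	if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
		return nil, err
	}
	return &page, nil
}
```
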
@@ -212,33 +230,60 @@ func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
-// - params: Optional parameters for the scrape request, including extractor options for LLM extraction.
+// - params: Optional parameters for the scrape request, such as formats, tags, and timeouts.
 //
 // Returns:
 // - *FirecrawlDocument: The scraped document data.
 // - error: An error if the scrape request fails.
-func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
-	headers := app.prepareHeaders("")
+func (app *FirecrawlApp) ScrapeURL(url string, params *ScrapeParams) (*FirecrawlDocument, error) {
+	headers := app.prepareHeaders(nil)
 
 	scrapeBody := map[string]any{"url": url}
 
 	if params != nil {
-		if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok {
-			if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok {
-				extractorOptions.ExtractionSchema = schema.schema()
-			}
-			if extractorOptions.Mode == "" {
-				extractorOptions.Mode = "llm-extraction"
-			}
-			scrapeBody["extractorOptions"] = extractorOptions
+		if params.Formats != nil {
+			scrapeBody["formats"] = params.Formats
 		}
-
-		for key, value := range params {
-			if key != "extractorOptions" {
-				scrapeBody[key] = value
-			}
+		if params.Headers != nil {
+			scrapeBody["headers"] = params.Headers
+		}
+		if params.IncludeTags != nil {
+			scrapeBody["includeTags"] = params.IncludeTags
+		}
+		if params.ExcludeTags != nil {
+			scrapeBody["excludeTags"] = params.ExcludeTags
+		}
+		if params.OnlyMainContent != nil {
+			scrapeBody["onlyMainContent"] = params.OnlyMainContent
+		}
+		if params.WaitFor != nil {
+			scrapeBody["waitFor"] = params.WaitFor
+		}
+		if params.ParsePDF != nil {
+			scrapeBody["parsePDF"] = params.ParsePDF
+		}
+		if params.Timeout != nil {
+			scrapeBody["timeout"] = params.Timeout
 		}
 	}
 
 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/v0/scrape", app.APIURL),
+		fmt.Sprintf("%s/v1/scrape", app.APIURL),
 		scrapeBody,
 		headers,
 		"scrape URL",
@@ -249,55 +294,95 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*Firecraw
 	var scrapeResponse ScrapeResponse
 	err = json.Unmarshal(resp, &scrapeResponse)
 	if err != nil {
 		return nil, err
 	}
 
 	if scrapeResponse.Success {
 		return scrapeResponse.Data, nil
 	}
 
 	return nil, fmt.Errorf("failed to scrape URL")
 }
 
-// Search performs a search query using the Firecrawl API and returns the search results.
+// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
 //
 // Parameters:
-// - query: The search query string.
-// - params: Optional parameters for the search request.
+// - url: The URL to crawl.
+// - params: Optional parameters for the crawl request.
+// - idempotencyKey: An optional idempotency key to ensure the request is idempotent (can be nil).
+// - pollInterval: An optional interval (in seconds) at which to poll the job status. Default is 2 seconds.
 //
 // Returns:
-// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results.
-// - error: An error if the search request fails.
-func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
-	headers := app.prepareHeaders("")
-	searchBody := map[string]any{"query": query}
-	for k, v := range params {
-		searchBody[k] = v
+// - *CrawlStatusResponse: The crawl result if the job is completed.
+// - error: An error if the crawl request fails.
+func (app *FirecrawlApp) CrawlURL(url string, params *CrawlParams, idempotencyKey *string, pollInterval ...int) (*CrawlStatusResponse, error) {
+	headers := app.prepareHeaders(idempotencyKey)
+	crawlBody := map[string]any{"url": url}
+
+	if params != nil {
+		if params.ScrapeOptions.Formats != nil {
+			crawlBody["scrapeOptions"] = params.ScrapeOptions
+		}
+		if params.Webhook != nil {
+			crawlBody["webhook"] = params.Webhook
+		}
+		if params.Limit != nil {
+			crawlBody["limit"] = params.Limit
+		}
+		if params.IncludePaths != nil {
+			crawlBody["includePaths"] = params.IncludePaths
+		}
+		if params.ExcludePaths != nil {
+			crawlBody["excludePaths"] = params.ExcludePaths
+		}
+		if params.MaxDepth != nil {
+			crawlBody["maxDepth"] = params.MaxDepth
+		}
+		if params.AllowBackwardLinks != nil {
+			crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks
+		}
+		if params.AllowExternalLinks != nil {
+			crawlBody["allowExternalLinks"] = params.AllowExternalLinks
+		}
+		if params.IgnoreSitemap != nil {
+			crawlBody["ignoreSitemap"] = params.IgnoreSitemap
+		}
+	}
+
+	actualPollInterval := 2
+	if len(pollInterval) > 0 {
+		actualPollInterval = pollInterval[0]
 	}
 
 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/v0/search", app.APIURL),
-		searchBody,
+		fmt.Sprintf("%s/v1/crawl", app.APIURL),
+		crawlBody,
 		headers,
-		"search",
+		"start crawl job",
+		withRetries(3),
+		withBackoff(500),
 	)
 	if err != nil {
 		return nil, err
 	}
 
-	var searchResponse SearchResponse
-	err = json.Unmarshal(resp, &searchResponse)
+	var crawlResponse CrawlResponse
+	err = json.Unmarshal(resp, &crawlResponse)
 	if err != nil {
 		return nil, err
 	}
 
-	if searchResponse.Success {
-		return searchResponse.Data, nil
-	}
-
-	return nil, fmt.Errorf("failed to search")
+	return app.monitorJobStatus(crawlResponse.ID, headers, actualPollInterval)
 }
 
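Since `CrawlURL` now blocks until the job finishes, the trailing variadic `pollInterval` is the only knob for how often it re-checks status; omitted, it defaults to 2 seconds. A usage sketch (the URL and the 5-second interval are illustrative):

```go
// Wait for the crawl, polling every 5 seconds instead of the default 2.
result, err := app.CrawlURL("https://example.com", nil, nil, 5)
if err != nil {
	log.Fatalf("crawl failed: %v", err)
}
fmt.Printf("%s: %d/%d pages, %d credits used\n",
	result.Status, result.Completed, result.Total, result.CreditsUsed)
```
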
-// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
+// AsyncCrawlURL starts a crawl job asynchronously for the specified URL using the Firecrawl API.
@@ -305,29 +390,60 @@ func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*Firecra
 //
 // Parameters:
 // - url: The URL to crawl.
 // - params: Optional parameters for the crawl request.
-// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
-// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
 // - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
 //
 // Returns:
-// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true.
+// - *CrawlResponse: The crawl response with the job ID.
 // - error: An error if the crawl request fails.
-func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
+func (app *FirecrawlApp) AsyncCrawlURL(url string, params *CrawlParams, idempotencyKey *string) (*CrawlResponse, error) {
 	headers := app.prepareHeaders(idempotencyKey)
 	crawlBody := map[string]any{"url": url}
-	for k, v := range params {
-		crawlBody[k] = v
+
+	if params != nil {
+		if params.ScrapeOptions.Formats != nil {
+			crawlBody["scrapeOptions"] = params.ScrapeOptions
+		}
+		if params.Webhook != nil {
+			crawlBody["webhook"] = params.Webhook
+		}
+		if params.Limit != nil {
+			crawlBody["limit"] = params.Limit
+		}
+		if params.IncludePaths != nil {
+			crawlBody["includePaths"] = params.IncludePaths
+		}
+		if params.ExcludePaths != nil {
+			crawlBody["excludePaths"] = params.ExcludePaths
+		}
+		if params.MaxDepth != nil {
+			crawlBody["maxDepth"] = params.MaxDepth
+		}
+		if params.AllowBackwardLinks != nil {
+			crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks
+		}
+		if params.AllowExternalLinks != nil {
+			crawlBody["allowExternalLinks"] = params.AllowExternalLinks
+		}
+		if params.IgnoreSitemap != nil {
+			crawlBody["ignoreSitemap"] = params.IgnoreSitemap
+		}
 	}
 
 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/v0/crawl", app.APIURL),
+		fmt.Sprintf("%s/v1/crawl", app.APIURL),
 		crawlBody,
 		headers,
 		"start crawl job",
 		withRetries(3),
 		withBackoff(500),
 	)
 	if err != nil {
 		return nil, err
 	}
@@ -338,30 +454,28 @@ func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDo
 		return nil, err
 	}
 
-	if waitUntilDone {
-		return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
-	}
-
-	if crawlResponse.JobID == "" {
+	if crawlResponse.ID == "" {
 		return nil, fmt.Errorf("failed to get job ID")
 	}
 
-	return crawlResponse.JobID, nil
+	return &crawlResponse, nil
 }
 
 // CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
 //
 // Parameters:
-// - jobID: The ID of the crawl job to check.
+// - ID: The ID of the crawl job to check.
 //
 // Returns:
-// - *JobStatusResponse: The status of the crawl job.
+// - *CrawlStatusResponse: The status of the crawl job.
 // - error: An error if the crawl status check request fails.
-func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
-	headers := app.prepareHeaders("")
+func (app *FirecrawlApp) CheckCrawlStatus(ID string) (*CrawlStatusResponse, error) {
+	headers := app.prepareHeaders(nil)
+	apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
+
 	resp, err := app.makeRequest(
 		http.MethodGet,
-		fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
+		apiURL,
 		nil,
 		headers,
 		"check crawl status",
@@ -372,7 +486,7 @@ func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, err
 		return nil, err
 	}
 
-	var jobStatusResponse JobStatusResponse
+	var jobStatusResponse CrawlStatusResponse
 	err = json.Unmarshal(resp, &jobStatusResponse)
 	if err != nil {
 		return nil, err
@@ -384,16 +498,17 @@ func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, err
 // CancelCrawlJob cancels a crawl job using the Firecrawl API.
 //
 // Parameters:
-// - jobID: The ID of the crawl job to cancel.
+// - ID: The ID of the crawl job to cancel.
 //
 // Returns:
 // - string: The status of the crawl job after cancellation.
 // - error: An error if the crawl job cancellation request fails.
-func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
-	headers := app.prepareHeaders("")
+func (app *FirecrawlApp) CancelCrawlJob(ID string) (string, error) {
+	headers := app.prepareHeaders(nil)
+	apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
 	resp, err := app.makeRequest(
 		http.MethodDelete,
-		fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID),
+		apiURL,
 		nil,
 		headers,
 		"cancel crawl job",
@@ -411,6 +526,70 @@ func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
 	return cancelCrawlJobResponse.Status, nil
 }
 
+// MapURL initiates a mapping operation for a URL using the Firecrawl API.
+//
+// Parameters:
+// - url: The URL to map.
+// - params: Optional parameters for the mapping request.
+//
+// Returns:
+// - *MapResponse: The response from the mapping operation.
+// - error: An error if the mapping request fails.
+func (app *FirecrawlApp) MapURL(url string, params *MapParams) (*MapResponse, error) {
+	headers := app.prepareHeaders(nil)
+	jsonData := map[string]any{"url": url}
+
+	if params != nil {
+		if params.IncludeSubdomains != nil {
+			jsonData["includeSubdomains"] = params.IncludeSubdomains
+		}
+		if params.Search != nil {
+			jsonData["search"] = params.Search
+		}
+		if params.IgnoreSitemap != nil {
+			jsonData["ignoreSitemap"] = params.IgnoreSitemap
+		}
+		if params.Limit != nil {
+			jsonData["limit"] = params.Limit
+		}
+	}
+
+	resp, err := app.makeRequest(
+		http.MethodPost,
+		fmt.Sprintf("%s/v1/map", app.APIURL),
+		jsonData,
+		headers,
+		"map",
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	var mapResponse MapResponse
+	err = json.Unmarshal(resp, &mapResponse)
+	if err != nil {
+		return nil, err
+	}
+
+	if mapResponse.Success {
+		return &mapResponse, nil
+	}
+
+	return nil, fmt.Errorf("map operation failed: %s", mapResponse.Error)
+}
+
+// Search performs a search query using the Firecrawl API.
+//
+// Search is not implemented in API version 1.0.0, so this method always returns an error.
+//
+// Parameters:
+// - query: The search query string.
+// - params: Optional parameters for the search request.
+//
+// Returns:
+// - error: Always returned, since search is not supported in v1.
+func (app *FirecrawlApp) Search(query string, params *any) (any, error) {
+	return nil, fmt.Errorf("search is not implemented in API version 1.0.0")
+}
+
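`MapURL` is new in v1 and the README portion of this diff does not cover it yet; a minimal usage sketch (the search term and limit are illustrative, and `ptr` is the helper sketched earlier):

```go
mapResult, err := app.MapURL("https://roastmywebsite.ai", &firecrawl.MapParams{
	Search: ptr("blog"), // optional keyword filter for discovered links
	Limit:  ptr(100),    // cap the number of returned links
})
if err != nil {
	log.Fatalf("map failed: %v", err)
}
for _, link := range mapResult.Links {
	fmt.Println(link)
}
```
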
 // prepareHeaders prepares the headers for an HTTP request.
 //
 // Parameters:
@@ -419,13 +598,13 @@ func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
 //
 // Returns:
 // - map[string]string: A map containing the headers for the HTTP request.
-func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
+func (app *FirecrawlApp) prepareHeaders(idempotencyKey *string) map[string]string {
 	headers := map[string]string{
 		"Content-Type":  "application/json",
 		"Authorization": fmt.Sprintf("Bearer %s", app.APIKey),
 	}
-	if idempotencyKey != "" {
-		headers["x-idempotency-key"] = idempotencyKey
+	if idempotencyKey != nil {
+		headers["x-idempotency-key"] = *idempotencyKey
 	}
 	return headers
 }
@@ -494,19 +673,20 @@ func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, he
 // monitorJobStatus monitors the status of a crawl job using the Firecrawl API.
 //
 // Parameters:
-// - jobID: The ID of the crawl job to monitor.
+// - ID: The ID of the crawl job to monitor.
 // - headers: The headers to be included in the request.
 // - pollInterval: The interval (in seconds) at which to poll the job status.
 //
 // Returns:
-// - []*FirecrawlDocument: The crawl result if the job is completed.
+// - *CrawlStatusResponse: The crawl result if the job is completed.
 // - error: An error if the crawl status check request fails.
-func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
-	attempts := 0
+func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string, pollInterval int) (*CrawlStatusResponse, error) {
+	attempts := 3
+
 	for {
 		resp, err := app.makeRequest(
 			http.MethodGet,
-			fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
+			fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID),
 			nil,
 			headers,
 			"check crawl status",
@@ -517,7 +697,7 @@ func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]strin
 			return nil, err
 		}
 
-		var statusData JobStatusResponse
+		var statusData CrawlStatusResponse
 		err = json.Unmarshal(resp, &statusData)
 		if err != nil {
 			return nil, err
@@ -530,13 +710,13 @@ func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]strin
 
 		if status == "completed" {
 			if statusData.Data != nil {
-				return statusData.Data, nil
+				return &statusData, nil
 			}
-			attempts++
-			if attempts > 3 {
+			attempts--
+			if attempts == 0 {
 				return nil, fmt.Errorf("crawl job completed but no data was returned")
 			}
-		} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" {
+		} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" {
 			pollInterval = max(pollInterval, 2)
 			time.Sleep(time.Duration(pollInterval) * time.Second)
 		} else {
diff --git a/firecrawl_test.go b/firecrawl_test.go
index c41c0b6..cf2b062 100644
--- a/firecrawl_test.go
+++ b/firecrawl_test.go
@@ -1,11 +1,13 @@
 package firecrawl
 
 import (
+	"log"
 	"os"
 	"testing"
 	"time"
 
 	"github.com/google/uuid"
+	"github.com/joho/godotenv"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -13,8 +15,16 @@ import (
 var API_URL string
 var TEST_API_KEY string
 
+func ptr[T any](v T) *T {
+	return &v
+}
+
 func init() {
-	API_URL = "http://127.0.0.1:3002"
+	err := godotenv.Load(".env")
+	if err != nil {
+		log.Fatalf("Error loading .env file: %v", err)
+	}
+	API_URL = os.Getenv("API_URL")
 	TEST_API_KEY = os.Getenv("TEST_API_KEY")
 }
 
@@ -39,7 +49,7 @@ func TestBlocklistedURL(t *testing.T) {
 	_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
 	assert.Error(t, err)
-	assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
+	assert.Contains(t, err.Error(), "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.")
 }
 
@@ -50,7 +60,7 @@ func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
 	require.NoError(t, err)
 	assert.NotNil(t, response)
-	assert.Contains(t, response.Content, "_Roast_")
+	assert.Contains(t, response.Markdown, "_Roast_")
 }
 
@@ -61,7 +71,7 @@ func TestScrapeURLE2E(t *testing.T) {
 	require.NoError(t, err)
 	assert.NotNil(t, response)
-	assert.Contains(t, response.Content, "_Roast_")
+	assert.Contains(t, response.Markdown, "_Roast_")
 	assert.NotEqual(t, response.Markdown, "")
 	assert.NotNil(t, response.Metadata)
 	assert.Equal(t, response.HTML, "")
@@ -71,18 +81,29 @@ func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
 	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
 	require.NoError(t, err)
 
-	params := map[string]any{
-		"pageOptions": map[string]any{
-			"includeHtml": true,
-		},
+	params := ScrapeParams{
+		Formats:         []string{"markdown", "html", "rawHtml", "screenshot", "links"},
+		Headers:         ptr(map[string]string{"x-key": "test"}),
+		IncludeTags:     []string{"h1"},
+		ExcludeTags:     []string{"h2"},
+		OnlyMainContent: ptr(true),
+		Timeout:         ptr(30000),
+		WaitFor:         ptr(1000),
 	}
-	response, err := app.ScrapeURL("https://roastmywebsite.ai", params)
+
+	response, err := app.ScrapeURL("https://roastmywebsite.ai", &params)
 	require.NoError(t, err)
 	assert.NotNil(t, response)
-	assert.Contains(t, response.Content, "_Roast_")
 	assert.Contains(t, response.Markdown, "_Roast_")
 	assert.Contains(t, response.HTML, "<h1")
 }
 
+func TestCheckCrawlStatusE2E(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &CrawlParams{
+		ScrapeOptions: ScrapeParams{
+			Formats: []string{"markdown", "html"},
+		},
+	}
+
+	asyncCrawlResponse, err := app.AsyncCrawlURL("https://roastmywebsite.ai", params, nil)
+	require.NoError(t, err)
+	assert.NotNil(t, asyncCrawlResponse.ID)
+
+	maxChecks := 15
+	checks := 0
+
+	for {
+		if checks >= maxChecks {
+			break
+		}
+
+		time.Sleep(5 * time.Second) // wait for 5 seconds
 
-	response, err := app.ScrapeURL("https://mendable.ai", params)
+		response, err := app.CheckCrawlStatus(asyncCrawlResponse.ID)
+		require.NoError(t, err)
+		assert.NotNil(t, response)
+
+		assert.GreaterOrEqual(t, len(response.Data), 0)
+		assert.GreaterOrEqual(t, response.Total, 0)
+		assert.GreaterOrEqual(t, response.CreditsUsed, 0)
+
+		if response.Status == "completed" {
+			break
+		}
+
+		checks++
+	}
+
+	// Final check after loop or if completed
+	response, err := app.CheckCrawlStatus(asyncCrawlResponse.ID)
 	require.NoError(t, err)
 	assert.NotNil(t, response)
-	assert.Contains(t, response.LLMExtraction, "company_mission")
-	assert.IsType(t, true, response.LLMExtraction["supports_sso"])
-	assert.IsType(t, true, response.LLMExtraction["is_open_source"])
+	assert.Equal(t, "completed", response.Status)
+	assert.Greater(t, len(response.Data), 0)
+	assert.Greater(t, response.Total, 0)
+	assert.Greater(t, response.Completed, 0)
+	assert.Greater(t, response.CreditsUsed, 0)
+	assert.NotNil(t, response.Data[0].Markdown)
+	assert.Contains(t, response.Data[0].HTML, "<div")
+}
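Taken together, the async test above mirrors the intended v1 workflow: kick off a crawl, poll by ID, then consume the documents. A minimal end-to-end sketch, assuming the module path `github.com/mendableai/firecrawl-go`, a placeholder API key, and that any status other than `completed` or an in-progress state is treated as failure:

```go
package main

import (
	"fmt"
	"log"
	"time"

	firecrawl "github.com/mendableai/firecrawl-go"
)

func main() {
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatal(err)
	}

	// Start the crawl without blocking.
	crawl, err := app.AsyncCrawlURL("https://example.com", nil, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Poll by ID until the job resolves.
	for {
		status, err := app.CheckCrawlStatus(crawl.ID)
		if err != nil {
			log.Fatal(err)
		}
		switch status.Status {
		case "completed":
			fmt.Printf("crawled %d pages, %d credits used\n", len(status.Data), status.CreditsUsed)
			return
		case "failed":
			log.Fatal("crawl failed")
		default: // e.g. "scraping"
			time.Sleep(2 * time.Second)
		}
	}
}
```
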