From 972996d6ee79b80064537621bdccb40c85f5cf33 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 23 Aug 2024 18:04:17 -0300 Subject: [PATCH 1/3] wip --- firecrawl.go | 173 ++++++++++++++++++++----- firecrawl_test.go | 80 ++++++------ firecrawl_test_v1.go | 292 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 473 insertions(+), 72 deletions(-) create mode 100644 firecrawl_test_v1.go diff --git a/firecrawl.go b/firecrawl.go index 9a9dcfe..8d01c0d 100644 --- a/firecrawl.go +++ b/firecrawl.go @@ -12,8 +12,8 @@ import ( "time" ) -// FirecrawlDocumentMetadata represents metadata for a Firecrawl document -type FirecrawlDocumentMetadata struct { +// FirecrawlDocumentMetadataV0 represents metadata for a Firecrawl document for v0 +type FirecrawlDocumentMetadataV0 struct { Title string `json:"title,omitempty"` Description string `json:"description,omitempty"` Language string `json:"language,omitempty"` @@ -48,8 +48,44 @@ type FirecrawlDocumentMetadata struct { PageError string `json:"pageError,omitempty"` } -// FirecrawlDocument represents a document in Firecrawl -type FirecrawlDocument struct { +// FirecrawlDocumentMetadata represents metadata for a Firecrawl document for v1 +type FirecrawlDocumentMetadata struct { + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Language string `json:"language,omitempty"` + Keywords string `json:"keywords,omitempty"` + Robots string `json:"robots,omitempty"` + OGTitle string `json:"ogTitle,omitempty"` + OGDescription string `json:"ogDescription,omitempty"` + OGURL string `json:"ogUrl,omitempty"` + OGImage string `json:"ogImage,omitempty"` + OGAudio string `json:"ogAudio,omitempty"` + OGDeterminer string `json:"ogDeterminer,omitempty"` + OGLocale string `json:"ogLocale,omitempty"` + OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"` + OGSiteName string `json:"ogSiteName,omitempty"` + OGVideo string `json:"ogVideo,omitempty"` + DCTermsCreated string `json:"dctermsCreated,omitempty"` + DCDateCreated string `json:"dcDateCreated,omitempty"` + DCDate string `json:"dcDate,omitempty"` + DCTermsType string `json:"dctermsType,omitempty"` + DCType string `json:"dcType,omitempty"` + DCTermsAudience string `json:"dctermsAudience,omitempty"` + DCTermsSubject string `json:"dctermsSubject,omitempty"` + DCSubject string `json:"dcSubject,omitempty"` + DCDescription string `json:"dcDescription,omitempty"` + DCTermsKeywords string `json:"dctermsKeywords,omitempty"` + ModifiedTime string `json:"modifiedTime,omitempty"` + PublishedTime string `json:"publishedTime,omitempty"` + ArticleTag string `json:"articleTag,omitempty"` + ArticleSection string `json:"articleSection,omitempty"` + SourceURL string `json:"sourceURL,omitempty"` + StatusCode int `json:"statusCode,omitempty"` + Error string `json:"error,omitempty"` +} + +// FirecrawlDocumentV0 represents a document in Firecrawl for v0 +type FirecrawlDocumentV0 struct { ID string `json:"id,omitempty"` URL string `json:"url,omitempty"` Content string `json:"content"` @@ -66,6 +102,23 @@ type FirecrawlDocument struct { Index int `json:"index,omitempty"` } +// FirecrawlDocument represents a document in Firecrawl for v1 +type FirecrawlDocument struct { + ID string `json:"id,omitempty"` + URL string `json:"url,omitempty"` + Markdown string `json:"markdown,omitempty"` + HTML string `json:"html,omitempty"` + LLMExtraction map[string]any `json:"llm_extraction,omitempty"` + CreatedAt *time.Time 
`json:"createdAt,omitempty"` + UpdatedAt *time.Time `json:"updatedAt,omitempty"` + Type string `json:"type,omitempty"` + Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` + ChildrenLinks []string `json:"childrenLinks,omitempty"` + Provider string `json:"provider,omitempty"` + Warning string `json:"warning,omitempty"` + Index int `json:"index,omitempty"` +} + // ExtractorOptions represents options for extraction. type ExtractorOptions struct { Mode string `json:"mode,omitempty"` @@ -73,39 +126,74 @@ type ExtractorOptions struct { ExtractionSchema any `json:"extractionSchema,omitempty"` } +// ScrapeResponseV0 represents the response for scraping operations for v0 +type ScrapeResponseV0 struct { + Success bool `json:"success"` + Data *FirecrawlDocumentV0 `json:"data,omitempty"` +} + // ScrapeResponse represents the response for scraping operations type ScrapeResponse struct { Success bool `json:"success"` Data *FirecrawlDocument `json:"data,omitempty"` } +// SearchResponseV0 represents the response for searching operations for v0 +type SearchResponseV0 struct { + Success bool `json:"success"` + Data []*FirecrawlDocumentV0 `json:"data,omitempty"` +} + // SearchResponse represents the response for searching operations type SearchResponse struct { Success bool `json:"success"` Data []*FirecrawlDocument `json:"data,omitempty"` } -// CrawlResponse represents the response for crawling operations +// CrawlResponseV0 represents the response for crawling operations for v0 +type CrawlResponseV0 struct { + Success bool `json:"success"` + JobID string `json:"jobId,omitempty"` + Data []*FirecrawlDocumentV0 `json:"data,omitempty"` +} + +// CrawlResponse represents the response for crawling operations for v1 type CrawlResponse struct { Success bool `json:"success"` - JobID string `json:"jobId,omitempty"` + ID string `json:"id,omitempty"` Data []*FirecrawlDocument `json:"data,omitempty"` } -// JobStatusResponse represents the response for checking crawl job status -type JobStatusResponse struct { - Success bool `json:"success"` +// JobStatusResponseV0 represents the response for checking crawl job status for v0 +type JobStatusResponseV0 struct { + Success bool `json:"success"` + Status string `json:"status"` + Current int `json:"current,omitempty"` + CurrentURL string `json:"current_url,omitempty"` + CurrentStep string `json:"current_step,omitempty"` + Total int `json:"total,omitempty"` + JobID string `json:"jobId,omitempty"` + Data []*FirecrawlDocumentV0 `json:"data,omitempty"` + PartialData []*FirecrawlDocumentV0 `json:"partial_data,omitempty"` +} + +// CrawlStatusResponse (old JobStatusResponse) represents the response for checking crawl job status for v1 +type CrawlStatusResponse struct { Status string `json:"status"` - Current int `json:"current,omitempty"` - CurrentURL string `json:"current_url,omitempty"` - CurrentStep string `json:"current_step,omitempty"` - Total int `json:"total,omitempty"` - JobID string `json:"jobId,omitempty"` + TotalCount int `json:"total_count,omitempty"` + CreditsUsed int `json:"credits_used,omitempty"` + ExpiresAt string `json:"expires_at,omitempty"` + Next string `json:"next,omitempty"` Data []*FirecrawlDocument `json:"data,omitempty"` - PartialData []*FirecrawlDocument `json:"partial_data,omitempty"` } -// CancelCrawlJobResponse represents the response for canceling a crawl job +// CancelCrawlJobResponseV0 represents the response for canceling a crawl job for v0 +type CancelCrawlJobResponseV0 struct { + Success bool `json:"success"` + Status string 
`json:"status"` +} + +// CancelCrawlJobResponse represents the response for canceling a crawl job for v1 type CancelCrawlJobResponse struct { Success bool `json:"success"` Status string `json:"status"` @@ -163,9 +251,10 @@ func withBackoff(backoff int) requestOption { // FirecrawlApp represents a client for the Firecrawl API. type FirecrawlApp struct { - APIKey string - APIURL string - Client *http.Client + APIKey string + APIURL string + Client *http.Client + Version string } // NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL. @@ -179,7 +268,7 @@ type FirecrawlApp struct { // Returns: // - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. // - error: An error if the API key is not provided or retrieved. -func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { +func NewFirecrawlApp(apiKey, apiURL string, version string) (*FirecrawlApp, error) { if apiKey == "" { apiKey = os.Getenv("FIRECRAWL_API_KEY") if apiKey == "" { @@ -194,14 +283,19 @@ func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { } } + if version == "" { + version = "v1" + } + client := &http.Client{ Timeout: 60 * time.Second, } return &FirecrawlApp{ - APIKey: apiKey, - APIURL: apiURL, - Client: client, + APIKey: apiKey, + APIURL: apiURL, + Client: client, + Version: version, }, nil } @@ -212,9 +306,9 @@ func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { // - params: Optional parameters for the scrape request, including extractor options for LLM extraction. // // Returns: -// - *FirecrawlDocument: The scraped document data. +// - *FirecrawlDocument or *FirecrawlDocumentV0: The scraped document data depending on the API version. // - error: An error if the scrape request fails. -func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) { +func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (any, error) { headers := app.prepareHeaders("") scrapeBody := map[string]any{"url": url} @@ -238,7 +332,7 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*Firecraw resp, err := app.makeRequest( http.MethodPost, - fmt.Sprintf("%s/v0/scrape", app.APIURL), + fmt.Sprintf("%s/%s/scrape", app.APIURL, app.Version), scrapeBody, headers, "scrape URL", @@ -247,14 +341,24 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*Firecraw return nil, err } - var scrapeResponse ScrapeResponse - err = json.Unmarshal(resp, &scrapeResponse) - if err != nil { - return nil, err + if app.Version == "v0" { + var scrapeResponseV0 ScrapeResponseV0 + err = json.Unmarshal(resp, &scrapeResponseV0) + + if scrapeResponseV0.Success { + return scrapeResponseV0.Data, nil + } + } else if app.Version == "v1" { + var scrapeResponse ScrapeResponse + err = json.Unmarshal(resp, &scrapeResponse) + + if scrapeResponse.Success { + return scrapeResponse.Data, nil + } } - if scrapeResponse.Success { - return scrapeResponse.Data, nil + if err != nil { + return nil, err } return nil, fmt.Errorf("failed to scrape URL") @@ -271,6 +375,11 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*Firecraw // - error: An error if the search request fails. 
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) { headers := app.prepareHeaders("") + + if app.Version == "v1" { + return nil, fmt.Errorf("Search is not supported in v1") + } + searchBody := map[string]any{"query": query} for k, v := range params { searchBody[k] = v diff --git a/firecrawl_test.go b/firecrawl_test.go index 9d56c7a..cba6c12 100644 --- a/firecrawl_test.go +++ b/firecrawl_test.go @@ -12,8 +12,8 @@ import ( "github.com/stretchr/testify/require" ) -var API_URL string -var TEST_API_KEY string +var API_URL_V0 string +var TEST_API_KEY_V0 string func init() { err := godotenv.Load("../.env") @@ -24,14 +24,14 @@ func init() { TEST_API_KEY = os.Getenv("TEST_API_KEY") } -func TestNoAPIKey(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL) +func TestNoAPIKeyV0(t *testing.T) { + _, err := NewFirecrawlApp("", API_URL, "v0") assert.Error(t, err) assert.Contains(t, err.Error(), "no API key provided") } -func TestScrapeURLInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) +func TestScrapeURLInvalidAPIKeyV0(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") require.NoError(t, err) _, err = app.ScrapeURL("https://firecrawl.dev", nil) @@ -39,8 +39,8 @@ func TestScrapeURLInvalidAPIKey(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") } -func TestBlocklistedURL(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestBlocklistedURLV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) @@ -48,8 +48,8 @@ func TestBlocklistedURL(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions.") } -func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL) +func TestSuccessfulResponseWithValidPreviewTokenV0(t *testing.T) { + app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL, "v0") require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) @@ -59,8 +59,8 @@ func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { assert.Contains(t, response.Content, "_Roast_") } -func TestScrapeURLE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestScrapeURLE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) @@ -73,8 +73,8 @@ func TestScrapeURLE2E(t *testing.T) { assert.Equal(t, response.HTML, "") } -func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTMLV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) params := map[string]any{ @@ -92,8 +92,8 @@ func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { assert.NotNil(t, response.Metadata) } -func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestSuccessfulResponseForValidScrapeWithPDFFileV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil) @@ -104,8 +104,8 @@ func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) { assert.NotNil(t, response.Metadata) } -func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtensionV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil) @@ -117,8 +117,8 @@ func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t * assert.NotNil(t, response.Metadata) } -func TestCrawlURLInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) +func TestCrawlURLInvalidAPIKeyV0(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") require.NoError(t, err) _, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "") @@ -126,8 +126,8 @@ func TestCrawlURLInvalidAPIKey(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token") } -func TestShouldReturnErrorForBlocklistedURL(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestShouldReturnErrorForBlocklistedURLV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) _, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "") @@ -135,8 +135,8 @@ func TestShouldReturnErrorForBlocklistedURL(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions.") } -func TestCrawlURLWaitForCompletionE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestCrawlURLWaitForCompletionE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) params := map[string]any{ @@ -154,8 +154,8 @@ func TestCrawlURLWaitForCompletionE2E(t *testing.T) { assert.Contains(t, data[0].Content, "_Roast_") } -func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestCrawlURLWithIdempotencyKeyE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) uniqueIdempotencyKey := uuid.New().String() @@ -178,8 +178,8 @@ func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) { assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used") } -func TestCheckCrawlStatusE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestCheckCrawlStatusE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) params := map[string]any{ @@ -205,8 +205,8 @@ func TestCheckCrawlStatusE2E(t *testing.T) { assert.Greater(t, len(statusResponse.Data), 0) } -func TestSearchE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestSearchE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) response, err := app.Search("test query", nil) @@ -217,8 +217,8 @@ func TestSearchE2E(t *testing.T) { assert.NotEqual(t, response[0].Content, "") } -func TestSearchInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) +func TestSearchInvalidAPIKeyV0(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") require.NoError(t, err) _, err = app.Search("test query", nil) @@ -226,8 +226,8 @@ func TestSearchInvalidAPIKey(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during search: Status code 401. Unauthorized: Invalid token") } -func TestLLMExtraction(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestLLMExtractionV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) params := map[string]any{ @@ -255,8 +255,8 @@ func TestLLMExtraction(t *testing.T) { assert.IsType(t, true, response.LLMExtraction["is_open_source"]) } -func TestCancelCrawlJobInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) +func TestCancelCrawlJobInvalidAPIKeyV0(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") require.NoError(t, err) _, err = app.CancelCrawlJob("test query") @@ -264,8 +264,8 @@ func TestCancelCrawlJobInvalidAPIKey(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during cancel crawl job: Status code 401. 
Unauthorized: Invalid token") } -func TestCancelNonExistingCrawlJob(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestCancelNonExistingCrawlJobV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) jobID := uuid.New().String() @@ -274,8 +274,8 @@ func TestCancelNonExistingCrawlJob(t *testing.T) { assert.Contains(t, err.Error(), "Job not found") } -func TestCancelCrawlJobE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) +func TestCancelCrawlJobE2EV0(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") require.NoError(t, err) response, err := app.CrawlURL("https://firecrawl.dev", nil, false, 2, "") diff --git a/firecrawl_test_v1.go b/firecrawl_test_v1.go new file mode 100644 index 0000000..0a0832f --- /dev/null +++ b/firecrawl_test_v1.go @@ -0,0 +1,292 @@ +package firecrawl + +import ( + "log" + "os" + "testing" + "time" + + "github.com/google/uuid" + "github.com/joho/godotenv" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var API_URL string +var TEST_API_KEY string + +func init() { + err := godotenv.Load("../.env") + if err != nil { + log.Fatalf("Error loading .env file: %v", err) + } + API_URL = os.Getenv("API_URL") + TEST_API_KEY = os.Getenv("TEST_API_KEY") +} + +func TestNoAPIKey(t *testing.T) { + _, err := NewFirecrawlApp("", API_URL, "v1") + assert.Error(t, err) + assert.Contains(t, err.Error(), "no API key provided") +} + +func TestScrapeURLInvalidAPIKey(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v1") + require.NoError(t, err) + + _, err = app.ScrapeURL("https://firecrawl.dev", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") +} + +func TestBlocklistedURL(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + require.NoError(t, err) + + _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions.") +} + +func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { + app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL, "v1") + require.NoError(t, err) + + response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) + require.NoError(t, err) + assert.NotNil(t, response) + + assert.Contains(t, response.Content, "_Roast_") +} + +func TestScrapeURLE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + require.NoError(t, err) + + response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) + require.NoError(t, err) + assert.NotNil(t, response) + + assert.Contains(t, response.Content, "_Roast_") + assert.NotEqual(t, response.Markdown, "") + assert.NotNil(t, response.Metadata) + assert.Equal(t, response.HTML, "") +} + +func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + require.NoError(t, err) + + params := map[string]any{ + "pageOptions": map[string]any{ + "includeHtml": true, + }, + } + response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + require.NoError(t, err) + assert.NotNil(t, response) + + assert.Contains(t, response.Content, "_Roast_") + assert.Contains(t, response.Markdown, "_Roast_") + assert.Contains(t, response.HTML, " Date: Mon, 26 Aug 2024 16:58:22 -0300 Subject: [PATCH 2/3] feat: v1 map, type fix, ... --- firecrawl.go | 263 ++++++++++---- firecrawl_test.go | 343 ++++++++++++------- firecrawl_test_v1.go => firecrawl_test.go_V0 | 146 ++++---- 3 files changed, 481 insertions(+), 271 deletions(-) rename firecrawl_test_v1.go => firecrawl_test.go_V0 (58%) diff --git a/firecrawl.go b/firecrawl.go index 8d01c0d..d391cb3 100644 --- a/firecrawl.go +++ b/firecrawl.go @@ -104,19 +104,12 @@ type FirecrawlDocumentV0 struct { // FirecrawlDocument represents a document in Firecrawl for v1 type FirecrawlDocument struct { - ID string `json:"id,omitempty"` - URL string `json:"url,omitempty"` - Markdown string `json:"markdown,omitempty"` - HTML string `json:"html,omitempty"` - LLMExtraction map[string]any `json:"llm_extraction,omitempty"` - CreatedAt *time.Time `json:"createdAt,omitempty"` - UpdatedAt *time.Time `json:"updatedAt,omitempty"` - Type string `json:"type,omitempty"` - Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` - ChildrenLinks []string `json:"childrenLinks,omitempty"` - Provider string `json:"provider,omitempty"` - Warning string `json:"warning,omitempty"` - Index int `json:"index,omitempty"` + Markdown string `json:"markdown,omitempty"` + HTML string `json:"html,omitempty"` + RawHTML string `json:"rawHtml,omitempty"` + Screenshot string `json:"screenshot,omitempty"` + Links []string `json:"links,omitempty"` + Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` } // ExtractorOptions represents options for extraction. 
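Reviewer note between hunks: as of this point in the series, ScrapeURL (reworked in patch 1 and untouched here) still returns any, handing back either a *FirecrawlDocumentV0 or the slimmed-down v1 *FirecrawlDocument above depending on app.Version, so callers have to type-assert. A minimal caller sketch of that dispatch follows; the import path, API key, and target URL are placeholders assumed for illustration, not taken from this diff.

package main

import (
	"fmt"
	"log"

	"github.com/mendableai/firecrawl-go" // import path assumed from the repo name
)

func main() {
	// Three-argument constructor from patch 1; "v1" is also the default version.
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-KEY", "https://api.firecrawl.dev", "v1")
	if err != nil {
		log.Fatal(err)
	}

	res, err := app.ScrapeURL("https://firecrawl.dev", nil)
	if err != nil {
		log.Fatal(err)
	}

	// ScrapeURL returns `any` at this stage, so dispatch on the concrete
	// document type that matches the configured API version.
	switch doc := res.(type) {
	case *firecrawl.FirecrawlDocument: // v1: markdown/html/links plus metadata
		fmt.Println(doc.Markdown)
	case *firecrawl.FirecrawlDocumentV0: // v0: legacy content field
		fmt.Println(doc.Content)
	}
}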
@@ -144,12 +137,6 @@ type SearchResponseV0 struct { Data []*FirecrawlDocumentV0 `json:"data,omitempty"` } -// SearchResponse represents the response for searching operations -type SearchResponse struct { - Success bool `json:"success"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - // CrawlResponseV0 represents the response for crawling operations for v0 type CrawlResponseV0 struct { Success bool `json:"success"` @@ -162,6 +149,7 @@ type CrawlResponse struct { Success bool `json:"success"` ID string `json:"id,omitempty"` Data []*FirecrawlDocument `json:"data,omitempty"` + URL string `json:"url,omitempty"` } // JobStatusResponseV0 represents the response for checking crawl job status for v0 @@ -180,9 +168,9 @@ type JobStatusResponseV0 struct { // CrawlStatusResponse (old JobStatusResponse) represents the response for checking crawl job status for v1 type CrawlStatusResponse struct { Status string `json:"status"` - TotalCount int `json:"total_count,omitempty"` - CreditsUsed int `json:"credits_used,omitempty"` - ExpiresAt string `json:"expires_at,omitempty"` + TotalCount int `json:"totalCount,omitempty"` + CreditsUsed int `json:"creditsUsed,omitempty"` + ExpiresAt string `json:"expiresAt,omitempty"` Next string `json:"next,omitempty"` Data []*FirecrawlDocument `json:"data,omitempty"` } @@ -199,6 +187,13 @@ type CancelCrawlJobResponse struct { Status string `json:"status"` } +// MapResponse represents the response for mapping operations +type MapResponse struct { + Success bool `json:"success"` + Links []string `json:"links,omitempty"` + Error string `json:"error,omitempty"` +} + // requestOptions represents options for making requests. type requestOptions struct { retries int @@ -373,7 +368,7 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (any, erro // Returns: // - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results. // - error: An error if the search request fails. 
-func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) { +func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocumentV0, error) { headers := app.prepareHeaders("") if app.Version == "v1" { @@ -396,7 +391,7 @@ func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*Firecra return nil, err } - var searchResponse SearchResponse + var searchResponse SearchResponseV0 err = json.Unmarshal(resp, &searchResponse) if err != nil { return nil, err @@ -430,7 +425,7 @@ func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDo resp, err := app.makeRequest( http.MethodPost, - fmt.Sprintf("%s/v0/crawl", app.APIURL), + fmt.Sprintf("%s/%s/crawl", app.APIURL, app.Version), crawlBody, headers, "start crawl job", @@ -441,36 +436,62 @@ func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDo return nil, err } - var crawlResponse CrawlResponse - err = json.Unmarshal(resp, &crawlResponse) - if err != nil { - return nil, err - } + if app.Version == "v0" { + var crawlResponse CrawlResponseV0 + err = json.Unmarshal(resp, &crawlResponse) + if err != nil { + return nil, err + } - if waitUntilDone { - return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval) - } + if waitUntilDone { + return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval, "") + } + + if crawlResponse.JobID == "" { + return nil, fmt.Errorf("failed to get job ID") + } + + return crawlResponse.JobID, nil + } else if app.Version == "v1" { + var crawlResponse CrawlResponse + err = json.Unmarshal(resp, &crawlResponse) + if err != nil { + return nil, err + } + + if waitUntilDone { + return app.monitorJobStatus(crawlResponse.ID, headers, pollInterval, crawlResponse.URL) + } - if crawlResponse.JobID == "" { - return nil, fmt.Errorf("failed to get job ID") + if crawlResponse.ID == "" { + return nil, fmt.Errorf("failed to get job ID") + } + + return crawlResponse.ID, nil } - return crawlResponse.JobID, nil + return nil, fmt.Errorf("invalid version") } // CheckCrawlStatus checks the status of a crawl job using the Firecrawl API. // // Parameters: -// - jobID: The ID of the crawl job to check. +// - ID: The ID of the crawl job to check. // // Returns: -// - *JobStatusResponse: The status of the crawl job. +// - *JobStatusResponse or *JobStatusResponseV0: The status of the crawl job. // - error: An error if the crawl status check request fails. 
-func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) { +func (app *FirecrawlApp) CheckCrawlStatus(ID string) (any, error) { headers := app.prepareHeaders("") + apiURL := "" + if app.Version == "v0" { + apiURL = fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, ID) + } else if app.Version == "v1" { + apiURL = fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID) + } resp, err := app.makeRequest( http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), + apiURL, nil, headers, "check crawl status", @@ -481,28 +502,46 @@ func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, err return nil, err } - var jobStatusResponse JobStatusResponse - err = json.Unmarshal(resp, &jobStatusResponse) - if err != nil { - return nil, err + if app.Version == "v0" { + var jobStatusResponse JobStatusResponseV0 + err = json.Unmarshal(resp, &jobStatusResponse) + if err != nil { + return nil, err + } + + return &jobStatusResponse, nil + } else if app.Version == "v1" { + var jobStatusResponse CrawlStatusResponse + err = json.Unmarshal(resp, &jobStatusResponse) + if err != nil { + return nil, err + } + + return &jobStatusResponse, nil } - return &jobStatusResponse, nil + return nil, fmt.Errorf("invalid version") } // CancelCrawlJob cancels a crawl job using the Firecrawl API. // // Parameters: -// - jobID: The ID of the crawl job to cancel. +// - ID: The ID of the crawl job to cancel. // // Returns: // - string: The status of the crawl job after cancellation. // - error: An error if the crawl job cancellation request fails. -func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) { +func (app *FirecrawlApp) CancelCrawlJob(ID string) (string, error) { headers := app.prepareHeaders("") + apiURL := "" + if app.Version == "v0" { + apiURL = fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, ID) + } else if app.Version == "v1" { + apiURL = fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID) + } resp, err := app.makeRequest( http.MethodDelete, - fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID), + apiURL, nil, headers, "cancel crawl job", @@ -520,6 +559,50 @@ func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) { return cancelCrawlJobResponse.Status, nil } +// MapURL initiates a mapping operation for a URL using the Firecrawl API. +// +// Parameters: +// - url: The URL to map. +// - params: Optional parameters for the mapping request. +// +// Returns: +// - *MapResponse: The response from the mapping operation. +// - error: An error if the mapping request fails. +func (app *FirecrawlApp) MapURL(url string, params map[string]any) (*MapResponse, error) { + if app.Version == "v0" { + return nil, fmt.Errorf("map is not supported in v0") + } + + headers := app.prepareHeaders("") + jsonData := map[string]any{"url": url} + for k, v := range params { + jsonData[k] = v + } + + resp, err := app.makeRequest( + http.MethodPost, + fmt.Sprintf("%s/%s/map", app.APIURL, app.Version), + jsonData, + headers, + "map", + ) + if err != nil { + return nil, err + } + + var mapResponse MapResponse + err = json.Unmarshal(resp, &mapResponse) + if err != nil { + return nil, err + } + + if mapResponse.Success { + return &mapResponse, nil + } else { + return nil, fmt.Errorf("map operation failed: %s", mapResponse.Error) + } +} + // prepareHeaders prepares the headers for an HTTP request. 
// // Parameters: @@ -603,19 +686,26 @@ func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, he // monitorJobStatus monitors the status of a crawl job using the Firecrawl API. // // Parameters: -// - jobID: The ID of the crawl job to monitor. +// - ID: The ID of the crawl job to monitor. // - headers: The headers to be included in the request. // - pollInterval: The interval (in seconds) at which to poll the job status. // // Returns: -// - []*FirecrawlDocument: The crawl result if the job is completed. +// - []*FirecrawlDocument or []*FirecrawlDocumentV0: The crawl result if the job is completed. // - error: An error if the crawl status check request fails. -func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) { +func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string, pollInterval int, checkUrl string) (any, error) { attempts := 0 + apiURL := "" + if app.Version == "v0" { + apiURL = fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, ID) + } else if app.Version == "v1" { + apiURL = checkUrl + } + for { resp, err := app.makeRequest( http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), + apiURL, nil, headers, "check crawl status", @@ -626,30 +716,59 @@ func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]strin return nil, err } - var statusData JobStatusResponse - err = json.Unmarshal(resp, &statusData) - if err != nil { - return nil, err - } + if app.Version == "v0" { + var statusData JobStatusResponseV0 + err = json.Unmarshal(resp, &statusData) + if err != nil { + return nil, err + } - status := statusData.Status - if status == "" { - return nil, fmt.Errorf("invalid status in response") - } + status := statusData.Status + if status == "" { + return nil, fmt.Errorf("invalid status in response") + } + + if status == "completed" { + if statusData.Data != nil { + return statusData.Data, nil + } + attempts++ + if attempts > 3 { + return nil, fmt.Errorf("crawl job completed but no data was returned") + } + } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" { + pollInterval = max(pollInterval, 2) + time.Sleep(time.Duration(pollInterval) * time.Second) + } else { + return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status) + } - if status == "completed" { - if statusData.Data != nil { - return statusData.Data, nil + } else if app.Version == "v1" { + var statusData CrawlStatusResponse + err = json.Unmarshal(resp, &statusData) + if err != nil { + return nil, err } - attempts++ - if attempts > 3 { - return nil, fmt.Errorf("crawl job completed but no data was returned") + + status := statusData.Status + if status == "" { + return nil, fmt.Errorf("invalid status in response") + } + + if status == "completed" { + if statusData.Data != nil { + return statusData, nil + } + attempts++ + if attempts > 3 { + return nil, fmt.Errorf("crawl job completed but no data was returned") + } + } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" { + pollInterval = max(pollInterval, 2) + time.Sleep(time.Duration(pollInterval) * time.Second) + } else { + return nil, fmt.Errorf("crawl job failed or was stopped. 
Status: %s", status) } - } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" { - pollInterval = max(pollInterval, 2) - time.Sleep(time.Duration(pollInterval) * time.Second) - } else { - return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status) } } } diff --git a/firecrawl_test.go b/firecrawl_test.go index cba6c12..101cbd8 100644 --- a/firecrawl_test.go +++ b/firecrawl_test.go @@ -12,8 +12,8 @@ import ( "github.com/stretchr/testify/require" ) -var API_URL_V0 string -var TEST_API_KEY_V0 string +var API_URL string +var TEST_API_KEY string func init() { err := godotenv.Load("../.env") @@ -24,14 +24,14 @@ func init() { TEST_API_KEY = os.Getenv("TEST_API_KEY") } -func TestNoAPIKeyV0(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL, "v0") +func TestNoAPIKey(t *testing.T) { + _, err := NewFirecrawlApp("", API_URL, "v1") assert.Error(t, err) assert.Contains(t, err.Error(), "no API key provided") } -func TestScrapeURLInvalidAPIKeyV0(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") +func TestScrapeURLInvalidAPIKey(t *testing.T) { + app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v1") require.NoError(t, err) _, err = app.ScrapeURL("https://firecrawl.dev", nil) @@ -39,73 +39,90 @@ func TestScrapeURLInvalidAPIKeyV0(t *testing.T) { assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") } -func TestBlocklistedURLV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") +func TestBlocklistedURL(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") require.NoError(t, err) _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.") + assert.Contains(t, err.Error(), "URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions.") } -func TestSuccessfulResponseWithValidPreviewTokenV0(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL, "v0") +func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { + app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL, "v1") require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) require.NoError(t, err) assert.NotNil(t, response) - assert.Contains(t, response.Content, "_Roast_") + scrapeResponse := response.(*FirecrawlDocument) + assert.Contains(t, scrapeResponse.Markdown, "_Roast_") } -func TestScrapeURLE2EV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") +func TestScrapeURLE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) require.NoError(t, err) assert.NotNil(t, response) - assert.Contains(t, response.Content, "_Roast_") - assert.NotEqual(t, response.Markdown, "") - assert.NotNil(t, response.Metadata) - assert.Equal(t, response.HTML, "") + scrapeResponse := response.(*FirecrawlDocument) + assert.Contains(t, scrapeResponse.Markdown, "_Roast_") + assert.NotEqual(t, scrapeResponse.Markdown, "") + assert.NotNil(t, scrapeResponse.Metadata) + assert.Equal(t, scrapeResponse.HTML, "") } -func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTMLV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") +func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") require.NoError(t, err) params := map[string]any{ - "pageOptions": map[string]any{ - "includeHtml": true, - }, + "formats": []string{"markdown", "html", "rawHtml", "screenshot", "links"}, + "headers": map[string]string{"x-key": "test"}, + "includeTags": []string{"h1"}, + "excludeTags": []string{"h2"}, + "onlyMainContent": true, + "timeout": 30000, + "waitFor": 1000, } + response, err := app.ScrapeURL("https://roastmywebsite.ai", params) require.NoError(t, err) assert.NotNil(t, response) - assert.Contains(t, response.Content, "_Roast_") - assert.Contains(t, response.Markdown, "_Roast_") - assert.Contains(t, response.HTML, "= maxChecks { + break + } - assert.Equal(t, "completed", statusResponse.Status) - assert.Greater(t, len(statusResponse.Data), 0) -} + time.Sleep(5 * time.Second) // wait for 5 seconds -func TestSearchE2EV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") - require.NoError(t, err) - - response, err := app.Search("test query", nil) - require.NoError(t, err) - assert.NotNil(t, response) + statusResponse, err := app.CheckCrawlStatus(jobID) + require.NoError(t, err) + assert.NotNil(t, statusResponse) - assert.Greater(t, len(response), 2) - assert.NotEqual(t, response[0].Content, "") -} + checkCrawlStatusResponse := statusResponse.(*CrawlStatusResponse) + assert.Greater(t, len(checkCrawlStatusResponse.Data), 0) + assert.GreaterOrEqual(t, checkCrawlStatusResponse.TotalCount, 0) + assert.GreaterOrEqual(t, checkCrawlStatusResponse.CreditsUsed, 0) -func TestSearchInvalidAPIKeyV0(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v0") - require.NoError(t, err) + if checkCrawlStatusResponse.Status == "completed" { + break + } - _, err = app.Search("test query", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during search: Status code 
401. Unauthorized: Invalid token") -} - -func TestLLMExtractionV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v0") - require.NoError(t, err) - - params := map[string]any{ - "extractorOptions": ExtractorOptions{ - Mode: "llm-extraction", - ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - ExtractionSchema: map[string]any{ - "type": "object", - "properties": map[string]any{ - "company_mission": map[string]string{"type": "string"}, - "supports_sso": map[string]string{"type": "boolean"}, - "is_open_source": map[string]string{"type": "boolean"}, - }, - "required": []string{"company_mission", "supports_sso", "is_open_source"}, - }, - }, + checks++ } - response, err := app.ScrapeURL("https://mendable.ai", params) + // Final check after loop or if completed + statusResponse, err := app.CheckCrawlStatus(jobID) require.NoError(t, err) - assert.NotNil(t, response) + assert.NotNil(t, statusResponse) - assert.Contains(t, response.LLMExtraction, "company_mission") - assert.IsType(t, true, response.LLMExtraction["supports_sso"]) - assert.IsType(t, true, response.LLMExtraction["is_open_source"]) + finalStatusResponse := statusResponse.(*CrawlStatusResponse) + assert.Equal(t, "completed", finalStatusResponse.Status) + assert.Greater(t, len(finalStatusResponse.Data), 0) + assert.Greater(t, finalStatusResponse.TotalCount, 0) + assert.Greater(t, finalStatusResponse.CreditsUsed, 0) + assert.NotNil(t, finalStatusResponse.Data[0].Markdown) + assert.Contains(t, finalStatusResponse.Data[0].HTML, " Date: Thu, 29 Aug 2024 14:55:37 -0300 Subject: [PATCH 3/3] feat(v1): ok --- firecrawl.go | 632 +++++++++++++++++++++------------------------- firecrawl_test.go | 335 ++++++++++++++---------- 2 files changed, 487 insertions(+), 480 deletions(-) diff --git a/firecrawl.go b/firecrawl.go index d391cb3..f0847ad 100644 --- a/firecrawl.go +++ b/firecrawl.go @@ -12,97 +12,43 @@ import ( "time" ) -// FirecrawlDocumentMetadataV0 represents metadata for a Firecrawl document for v0 -type FirecrawlDocumentMetadataV0 struct { - Title string `json:"title,omitempty"` - Description string `json:"description,omitempty"` - Language string `json:"language,omitempty"` - Keywords string `json:"keywords,omitempty"` - Robots string `json:"robots,omitempty"` - OGTitle string `json:"ogTitle,omitempty"` - OGDescription string `json:"ogDescription,omitempty"` - OGURL string `json:"ogUrl,omitempty"` - OGImage string `json:"ogImage,omitempty"` - OGAudio string `json:"ogAudio,omitempty"` - OGDeterminer string `json:"ogDeterminer,omitempty"` - OGLocale string `json:"ogLocale,omitempty"` - OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"` - OGSiteName string `json:"ogSiteName,omitempty"` - OGVideo string `json:"ogVideo,omitempty"` - DCTermsCreated string `json:"dctermsCreated,omitempty"` - DCDateCreated string `json:"dcDateCreated,omitempty"` - DCDate string `json:"dcDate,omitempty"` - DCTermsType string `json:"dctermsType,omitempty"` - DCType string `json:"dcType,omitempty"` - DCTermsAudience string `json:"dctermsAudience,omitempty"` - DCTermsSubject string `json:"dctermsSubject,omitempty"` - DCSubject string `json:"dcSubject,omitempty"` - DCDescription string `json:"dcDescription,omitempty"` - DCTermsKeywords string `json:"dctermsKeywords,omitempty"` - ModifiedTime string `json:"modifiedTime,omitempty"` - PublishedTime string `json:"publishedTime,omitempty"` - ArticleTag string `json:"articleTag,omitempty"` - 
ArticleSection string `json:"articleSection,omitempty"` - SourceURL string `json:"sourceURL,omitempty"` - PageStatusCode int `json:"pageStatusCode,omitempty"` - PageError string `json:"pageError,omitempty"` -} - -// FirecrawlDocumentMetadata represents metadata for a Firecrawl document for v1 +// FirecrawlDocumentMetadata represents metadata for a Firecrawl document type FirecrawlDocumentMetadata struct { - Title string `json:"title,omitempty"` - Description string `json:"description,omitempty"` - Language string `json:"language,omitempty"` - Keywords string `json:"keywords,omitempty"` - Robots string `json:"robots,omitempty"` - OGTitle string `json:"ogTitle,omitempty"` - OGDescription string `json:"ogDescription,omitempty"` - OGURL string `json:"ogUrl,omitempty"` - OGImage string `json:"ogImage,omitempty"` - OGAudio string `json:"ogAudio,omitempty"` - OGDeterminer string `json:"ogDeterminer,omitempty"` - OGLocale string `json:"ogLocale,omitempty"` - OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"` - OGSiteName string `json:"ogSiteName,omitempty"` - OGVideo string `json:"ogVideo,omitempty"` - DCTermsCreated string `json:"dctermsCreated,omitempty"` - DCDateCreated string `json:"dcDateCreated,omitempty"` - DCDate string `json:"dcDate,omitempty"` - DCTermsType string `json:"dctermsType,omitempty"` - DCType string `json:"dcType,omitempty"` - DCTermsAudience string `json:"dctermsAudience,omitempty"` - DCTermsSubject string `json:"dctermsSubject,omitempty"` - DCSubject string `json:"dcSubject,omitempty"` - DCDescription string `json:"dcDescription,omitempty"` - DCTermsKeywords string `json:"dctermsKeywords,omitempty"` - ModifiedTime string `json:"modifiedTime,omitempty"` - PublishedTime string `json:"publishedTime,omitempty"` - ArticleTag string `json:"articleTag,omitempty"` - ArticleSection string `json:"articleSection,omitempty"` - SourceURL string `json:"sourceURL,omitempty"` - StatusCode int `json:"statusCode,omitempty"` - Error string `json:"error,omitempty"` -} - -// FirecrawlDocumentV0 represents a document in Firecrawl for v0 -type FirecrawlDocumentV0 struct { - ID string `json:"id,omitempty"` - URL string `json:"url,omitempty"` - Content string `json:"content"` - Markdown string `json:"markdown,omitempty"` - HTML string `json:"html,omitempty"` - LLMExtraction map[string]any `json:"llm_extraction,omitempty"` - CreatedAt *time.Time `json:"createdAt,omitempty"` - UpdatedAt *time.Time `json:"updatedAt,omitempty"` - Type string `json:"type,omitempty"` - Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` - ChildrenLinks []string `json:"childrenLinks,omitempty"` - Provider string `json:"provider,omitempty"` - Warning string `json:"warning,omitempty"` - Index int `json:"index,omitempty"` + Title *string `json:"title,omitempty"` + Description *string `json:"description,omitempty"` + Language *string `json:"language,omitempty"` + Keywords *string `json:"keywords,omitempty"` + Robots *string `json:"robots,omitempty"` + OGTitle *string `json:"ogTitle,omitempty"` + OGDescription *string `json:"ogDescription,omitempty"` + OGURL *string `json:"ogUrl,omitempty"` + OGImage *string `json:"ogImage,omitempty"` + OGAudio *string `json:"ogAudio,omitempty"` + OGDeterminer *string `json:"ogDeterminer,omitempty"` + OGLocale *string `json:"ogLocale,omitempty"` + OGLocaleAlternate []*string `json:"ogLocaleAlternate,omitempty"` + OGSiteName *string `json:"ogSiteName,omitempty"` + OGVideo *string `json:"ogVideo,omitempty"` + DCTermsCreated *string `json:"dctermsCreated,omitempty"` + 
DCDateCreated *string `json:"dcDateCreated,omitempty"` + DCDate *string `json:"dcDate,omitempty"` + DCTermsType *string `json:"dctermsType,omitempty"` + DCType *string `json:"dcType,omitempty"` + DCTermsAudience *string `json:"dctermsAudience,omitempty"` + DCTermsSubject *string `json:"dctermsSubject,omitempty"` + DCSubject *string `json:"dcSubject,omitempty"` + DCDescription *string `json:"dcDescription,omitempty"` + DCTermsKeywords *string `json:"dctermsKeywords,omitempty"` + ModifiedTime *string `json:"modifiedTime,omitempty"` + PublishedTime *string `json:"publishedTime,omitempty"` + ArticleTag *string `json:"articleTag,omitempty"` + ArticleSection *string `json:"articleSection,omitempty"` + SourceURL *string `json:"sourceURL,omitempty"` + StatusCode *int `json:"statusCode,omitempty"` + Error *string `json:"error,omitempty"` } -// FirecrawlDocument represents a document in Firecrawl for v1 +// FirecrawlDocument represents a document in Firecrawl type FirecrawlDocument struct { Markdown string `json:"markdown,omitempty"` HTML string `json:"html,omitempty"` @@ -112,17 +58,16 @@ type FirecrawlDocument struct { Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` } -// ExtractorOptions represents options for extraction. -type ExtractorOptions struct { - Mode string `json:"mode,omitempty"` - ExtractionPrompt string `json:"extractionPrompt,omitempty"` - ExtractionSchema any `json:"extractionSchema,omitempty"` -} - -// ScrapeResponseV0 represents the response for scraping operations for v0 -type ScrapeResponseV0 struct { - Success bool `json:"success"` - Data *FirecrawlDocumentV0 `json:"data,omitempty"` +// ScrapeParams represents the parameters for a scrape request. +type ScrapeParams struct { + Formats []string `json:"formats,omitempty"` + Headers *map[string]string `json:"headers,omitempty"` + IncludeTags []string `json:"includeTags,omitempty"` + ExcludeTags []string `json:"excludeTags,omitempty"` + OnlyMainContent *bool `json:"onlyMainContent,omitempty"` + WaitFor *int `json:"waitFor,omitempty"` + ParsePDF *bool `json:"parsePDF,omitempty"` + Timeout *int `json:"timeout,omitempty"` } // ScrapeResponse represents the response for scraping operations @@ -131,60 +76,49 @@ type ScrapeResponse struct { Data *FirecrawlDocument `json:"data,omitempty"` } -// SearchResponseV0 represents the response for searching operations for v0 -type SearchResponseV0 struct { - Success bool `json:"success"` - Data []*FirecrawlDocumentV0 `json:"data,omitempty"` -} - -// CrawlResponseV0 represents the response for crawling operations for v0 -type CrawlResponseV0 struct { - Success bool `json:"success"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocumentV0 `json:"data,omitempty"` +// CrawlParams represents the parameters for a crawl request. 
+type CrawlParams struct { + ScrapeOptions ScrapeParams `json:"scrapeOptions"` + Webhook *string `json:"webhook,omitempty"` + Limit *int `json:"limit,omitempty"` + IncludePaths []string `json:"includePaths,omitempty"` + ExcludePaths []string `json:"excludePaths,omitempty"` + MaxDepth *int `json:"maxDepth,omitempty"` + AllowBackwardLinks *bool `json:"allowBackwardLinks,omitempty"` + AllowExternalLinks *bool `json:"allowExternalLinks,omitempty"` + IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"` } -// CrawlResponse represents the response for crawling operations for v1 +// CrawlResponse represents the response for crawling operations type CrawlResponse struct { - Success bool `json:"success"` - ID string `json:"id,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` - URL string `json:"url,omitempty"` -} - -// JobStatusResponseV0 represents the response for checking crawl job status for v0 -type JobStatusResponseV0 struct { - Success bool `json:"success"` - Status string `json:"status"` - Current int `json:"current,omitempty"` - CurrentURL string `json:"current_url,omitempty"` - CurrentStep string `json:"current_step,omitempty"` - Total int `json:"total,omitempty"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocumentV0 `json:"data,omitempty"` - PartialData []*FirecrawlDocumentV0 `json:"partial_data,omitempty"` + Success bool `json:"success"` + ID string `json:"id,omitempty"` + URL string `json:"url,omitempty"` } -// CrawlStatusResponse (old JobStatusResponse) represents the response for checking crawl job status for v1 +// CrawlStatusResponse (old JobStatusResponse) represents the response for checking crawl job type CrawlStatusResponse struct { Status string `json:"status"` - TotalCount int `json:"totalCount,omitempty"` + Total int `json:"total,omitempty"` + Completed int `json:"completed,omitempty"` CreditsUsed int `json:"creditsUsed,omitempty"` ExpiresAt string `json:"expiresAt,omitempty"` - Next string `json:"next,omitempty"` + Next *string `json:"next,omitempty"` Data []*FirecrawlDocument `json:"data,omitempty"` } -// CancelCrawlJobResponseV0 represents the response for canceling a crawl job for v0 -type CancelCrawlJobResponseV0 struct { +// CancelCrawlJobResponse represents the response for canceling a crawl job +type CancelCrawlJobResponse struct { Success bool `json:"success"` Status string `json:"status"` } -// CancelCrawlJobResponse represents the response for canceling a crawl job for v1 -type CancelCrawlJobResponse struct { - Success bool `json:"success"` - Status string `json:"status"` +// MapParams represents the parameters for a map request. +type MapParams struct { + IncludeSubdomains *bool `json:"includeSubdomains,omitempty"` + Search *string `json:"search,omitempty"` + IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"` + Limit *int `json:"limit,omitempty"` } // MapResponse represents the response for mapping operations @@ -263,7 +197,7 @@ type FirecrawlApp struct { // Returns: // - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. // - error: An error if the API key is not provided or retrieved. 
-func NewFirecrawlApp(apiKey, apiURL string, version string) (*FirecrawlApp, error) { +func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { if apiKey == "" { apiKey = os.Getenv("FIRECRAWL_API_KEY") if apiKey == "" { @@ -278,19 +212,14 @@ func NewFirecrawlApp(apiKey, apiURL string, version string) (*FirecrawlApp, erro } } - if version == "" { - version = "v1" - } - client := &http.Client{ Timeout: 60 * time.Second, } return &FirecrawlApp{ - APIKey: apiKey, - APIURL: apiURL, - Client: client, - Version: version, + APIKey: apiKey, + APIURL: apiURL, + Client: client, }, nil } @@ -303,31 +232,58 @@ func NewFirecrawlApp(apiKey, apiURL string, version string) (*FirecrawlApp, erro // Returns: // - *FirecrawlDocument or *FirecrawlDocumentV0: The scraped document data depending on the API version. // - error: An error if the scrape request fails. -func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (any, error) { - headers := app.prepareHeaders("") +func (app *FirecrawlApp) ScrapeURL(url string, params *ScrapeParams) (*FirecrawlDocument, error) { + headers := app.prepareHeaders(nil) scrapeBody := map[string]any{"url": url} + // if params != nil { + // if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok { + // if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok { + // extractorOptions.ExtractionSchema = schema.schema() + // } + // if extractorOptions.Mode == "" { + // extractorOptions.Mode = "llm-extraction" + // } + // scrapeBody["extractorOptions"] = extractorOptions + // } + + // for key, value := range params { + // if key != "extractorOptions" { + // scrapeBody[key] = value + // } + // } + // } + if params != nil { - if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok { - if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok { - extractorOptions.ExtractionSchema = schema.schema() - } - if extractorOptions.Mode == "" { - extractorOptions.Mode = "llm-extraction" - } - scrapeBody["extractorOptions"] = extractorOptions + if params.Formats != nil { + scrapeBody["formats"] = params.Formats } - - for key, value := range params { - if key != "extractorOptions" { - scrapeBody[key] = value - } + if params.Headers != nil { + scrapeBody["headers"] = params.Headers + } + if params.IncludeTags != nil { + scrapeBody["includeTags"] = params.IncludeTags + } + if params.ExcludeTags != nil { + scrapeBody["excludeTags"] = params.ExcludeTags + } + if params.OnlyMainContent != nil { + scrapeBody["onlyMainContent"] = params.OnlyMainContent + } + if params.WaitFor != nil { + scrapeBody["waitFor"] = params.WaitFor + } + if params.ParsePDF != nil { + scrapeBody["parsePDF"] = params.ParsePDF + } + if params.Timeout != nil { + scrapeBody["timeout"] = params.Timeout } } resp, err := app.makeRequest( http.MethodPost, - fmt.Sprintf("%s/%s/scrape", app.APIURL, app.Version), + fmt.Sprintf("%s/v1/scrape", app.APIURL), scrapeBody, headers, "scrape URL", @@ -336,20 +292,11 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (any, erro return nil, err } - if app.Version == "v0" { - var scrapeResponseV0 ScrapeResponseV0 - err = json.Unmarshal(resp, &scrapeResponseV0) + var scrapeResponse ScrapeResponse + err = json.Unmarshal(resp, &scrapeResponse) - if scrapeResponseV0.Success { - return scrapeResponseV0.Data, nil - } - } else if app.Version == "v1" { - var scrapeResponse ScrapeResponse - err = json.Unmarshal(resp, &scrapeResponse) - - if scrapeResponse.Success { - 
-			return scrapeResponse.Data, nil
-		}
+	if scrapeResponse.Success {
+		return scrapeResponse.Data, nil
 	}

 	if err != nil {
@@ -359,49 +306,83 @@ func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (any, erro
 		return nil, err
 	}

 	return nil, fmt.Errorf("failed to scrape URL")
 }

-// Search performs a search query using the Firecrawl API and returns the search results.
+// CrawlURL starts a crawl job for the specified URL using the Firecrawl API,
+// then polls until the job completes.
 //
 // Parameters:
-// - query: The search query string.
-// - params: Optional parameters for the search request.
+// - url: The URL to crawl.
+// - params: Optional parameters for the crawl request.
+// - idempotencyKey: An optional idempotency key to ensure the request is idempotent (can be nil).
+// - pollInterval: An optional interval (in seconds) at which to poll the job status. Default is 2 seconds.
 //
 // Returns:
-// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results.
-// - error: An error if the search request fails.
-func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocumentV0, error) {
-	headers := app.prepareHeaders("")
+// - *CrawlStatusResponse: The crawl result once the job is completed.
+// - error: An error if the crawl request fails.
+func (app *FirecrawlApp) CrawlURL(url string, params *CrawlParams, idempotencyKey *string, pollInterval ...int) (*CrawlStatusResponse, error) {
+	var key string
+	if idempotencyKey != nil {
+		key = *idempotencyKey
+	}
+
+	headers := app.prepareHeaders(&key)
+	crawlBody := map[string]any{"url": url}

-	if app.Version == "v1" {
-		return nil, fmt.Errorf("Search is not supported in v1")
+	if params != nil {
+		if params.ScrapeOptions.Formats != nil {
+			crawlBody["scrapeOptions"] = params.ScrapeOptions
+		}
+		if params.Webhook != nil {
+			crawlBody["webhook"] = params.Webhook
+		}
+		if params.Limit != nil {
+			crawlBody["limit"] = params.Limit
+		}
+		if params.IncludePaths != nil {
+			crawlBody["includePaths"] = params.IncludePaths
+		}
+		if params.ExcludePaths != nil {
+			crawlBody["excludePaths"] = params.ExcludePaths
+		}
+		if params.MaxDepth != nil {
+			crawlBody["maxDepth"] = params.MaxDepth
+		}
+		if params.AllowBackwardLinks != nil {
+			crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks
+		}
+		if params.AllowExternalLinks != nil {
+			crawlBody["allowExternalLinks"] = params.AllowExternalLinks
+		}
+		if params.IgnoreSitemap != nil {
+			crawlBody["ignoreSitemap"] = params.IgnoreSitemap
+		}
 	}

-	searchBody := map[string]any{"query": query}
-	for k, v := range params {
-		searchBody[k] = v
+	actualPollInterval := 2
+	if len(pollInterval) > 0 {
+		actualPollInterval = pollInterval[0]
 	}

 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/v0/search", app.APIURL),
-		searchBody,
+		fmt.Sprintf("%s/v1/crawl", app.APIURL),
+		crawlBody,
 		headers,
-		"search",
+		"start crawl job",
+		withRetries(3),
+		withBackoff(500),
 	)
 	if err != nil {
 		return nil, err
 	}

-	var searchResponse SearchResponseV0
-	err = json.Unmarshal(resp, &searchResponse)
+	var crawlResponse CrawlResponse
+	err = json.Unmarshal(resp, &crawlResponse)
 	if err != nil {
 		return nil, err
 	}

-	if searchResponse.Success {
-		return searchResponse.Data, nil
-	}
-
-	return nil, fmt.Errorf("failed to search")
+	return app.monitorJobStatus(crawlResponse.ID, headers, actualPollInterval)
 }
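Usage sketch for reviewers (not part of the patch): the synchronous v1 flow end to end. The module import path and the placeholder API key are assumptions, and the local ptr helper mirrors the generic one added to the tests below.

package main

import (
	"fmt"
	"log"

	firecrawl "github.com/mendableai/firecrawl-go" // assumed module path
)

// ptr mirrors the test helper for pointer-typed optional fields.
func ptr[T any](v T) *T { return &v }

func main() {
	// Key and URL are placeholders.
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatal(err)
	}

	// CrawlURL blocks: it starts the job, then polls every 2 seconds
	// (the variadic pollInterval) until the job completes or fails.
	status, err := app.CrawlURL("https://firecrawl.dev", &firecrawl.CrawlParams{
		Limit:    ptr(5),
		MaxDepth: ptr(2),
	}, nil, 2)
	if err != nil {
		log.Fatal(err)
	}
	for _, doc := range status.Data {
		fmt.Println(doc.Metadata.SourceURL)
	}
}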
@@ -409,68 +390,75 @@ func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*Firecra
-// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
+// AsyncCrawlURL starts a crawl job for the specified URL using the Firecrawl API
+// without waiting for it to finish.
 //
 // Parameters:
 // - url: The URL to crawl.
 // - params: Optional parameters for the crawl request.
-// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
-// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
 // - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
 //
 // Returns:
-// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true.
+// - *CrawlResponse: The crawl response containing the job ID.
 // - error: An error if the crawl request fails.
-func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
-	headers := app.prepareHeaders(idempotencyKey)
+func (app *FirecrawlApp) AsyncCrawlURL(url string, params *CrawlParams, idempotencyKey *string) (*CrawlResponse, error) {
+	var key string
+	if idempotencyKey != nil {
+		key = *idempotencyKey
+	}
+
+	headers := app.prepareHeaders(&key)
 	crawlBody := map[string]any{"url": url}
-	for k, v := range params {
-		crawlBody[k] = v
+
+	if params != nil {
+		if params.ScrapeOptions.Formats != nil {
+			crawlBody["scrapeOptions"] = params.ScrapeOptions
+		}
+		if params.Webhook != nil {
+			crawlBody["webhook"] = params.Webhook
+		}
+		if params.Limit != nil {
+			crawlBody["limit"] = params.Limit
+		}
+		if params.IncludePaths != nil {
+			crawlBody["includePaths"] = params.IncludePaths
+		}
+		if params.ExcludePaths != nil {
+			crawlBody["excludePaths"] = params.ExcludePaths
+		}
+		if params.MaxDepth != nil {
+			crawlBody["maxDepth"] = params.MaxDepth
+		}
+		if params.AllowBackwardLinks != nil {
+			crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks
+		}
+		if params.AllowExternalLinks != nil {
+			crawlBody["allowExternalLinks"] = params.AllowExternalLinks
+		}
+		if params.IgnoreSitemap != nil {
+			crawlBody["ignoreSitemap"] = params.IgnoreSitemap
+		}
 	}

 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/%s/crawl", app.APIURL, app.Version),
+		fmt.Sprintf("%s/v1/crawl", app.APIURL),
 		crawlBody,
 		headers,
 		"start crawl job",
 		withRetries(3),
 		withBackoff(500),
 	)
+
 	if err != nil {
 		return nil, err
 	}

-	if app.Version == "v0" {
-		var crawlResponse CrawlResponseV0
-		err = json.Unmarshal(resp, &crawlResponse)
-		if err != nil {
-			return nil, err
-		}
-
-		if waitUntilDone {
-			return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval, "")
-		}
-
-		if crawlResponse.JobID == "" {
-			return nil, fmt.Errorf("failed to get job ID")
-		}
-
-		return crawlResponse.JobID, nil
-	} else if app.Version == "v1" {
-		var crawlResponse CrawlResponse
-		err = json.Unmarshal(resp, &crawlResponse)
-		if err != nil {
-			return nil, err
-		}
-
-		if waitUntilDone {
-			return app.monitorJobStatus(crawlResponse.ID, headers, pollInterval, crawlResponse.URL)
-		}
-
-		if crawlResponse.ID == "" {
-			return nil, fmt.Errorf("failed to get job ID")
-		}
+	var crawlResponse CrawlResponse
+	err = json.Unmarshal(resp, &crawlResponse)
+	if err != nil {
+		return nil, err
+	}

-		return crawlResponse.ID, nil
+	if crawlResponse.ID == "" {
+		return nil, fmt.Errorf("failed to get job ID")
 	}

-	return nil, fmt.Errorf("invalid version")
+	return &crawlResponse, nil
 }
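A companion sketch, under the same assumptions, of the asynchronous flow: AsyncCrawlURL hands back only the job ID, and the caller drives polling with CheckCrawlStatus (or aborts with CancelCrawlJob instead of waiting).

package main

import (
	"fmt"
	"log"
	"time"

	firecrawl "github.com/mendableai/firecrawl-go" // assumed module path
)

func main() {
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatal(err)
	}

	// AsyncCrawlURL returns as soon as the API hands back a job ID.
	job, err := app.AsyncCrawlURL("https://firecrawl.dev", nil, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Poll on the caller's own schedule; "completed" is the terminal success state.
	for {
		status, err := app.CheckCrawlStatus(job.ID)
		if err != nil {
			log.Fatal(err)
		}
		if status.Status == "completed" {
			fmt.Printf("crawled %d of %d pages\n", status.Completed, status.Total)
			break
		}
		time.Sleep(5 * time.Second)
	}
}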
@@ -479,16 +467,12 @@ func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDo
 // CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
 //
 // Parameters:
 // - ID: The ID of the crawl job to check.
 //
 // Returns:
-// - *JobStatusResponse or *JobStatusResponseV0: The status of the crawl job.
+// - *CrawlStatusResponse: The status of the crawl job.
 // - error: An error if the crawl status check request fails.
-func (app *FirecrawlApp) CheckCrawlStatus(ID string) (any, error) {
-	headers := app.prepareHeaders("")
-	apiURL := ""
-	if app.Version == "v0" {
-		apiURL = fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, ID)
-	} else if app.Version == "v1" {
-		apiURL = fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
-	}
+func (app *FirecrawlApp) CheckCrawlStatus(ID string) (*CrawlStatusResponse, error) {
+	headers := app.prepareHeaders(nil)
+	apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
+
 	resp, err := app.makeRequest(
 		http.MethodGet,
 		apiURL,
@@ -502,25 +486,13 @@ func (app *FirecrawlApp) CheckCrawlStatus(ID string) (any, error) {
 		return nil, err
 	}

-	if app.Version == "v0" {
-		var jobStatusResponse JobStatusResponseV0
-		err = json.Unmarshal(resp, &jobStatusResponse)
-		if err != nil {
-			return nil, err
-		}
-
-		return &jobStatusResponse, nil
-	} else if app.Version == "v1" {
-		var jobStatusResponse CrawlStatusResponse
-		err = json.Unmarshal(resp, &jobStatusResponse)
-		if err != nil {
-			return nil, err
-		}
-
-		return &jobStatusResponse, nil
+	var jobStatusResponse CrawlStatusResponse
+	err = json.Unmarshal(resp, &jobStatusResponse)
+	if err != nil {
+		return nil, err
 	}

-	return nil, fmt.Errorf("invalid version")
+	return &jobStatusResponse, nil
 }

 // CancelCrawlJob cancels a crawl job using the Firecrawl API.
@@ -532,13 +504,8 @@ func (app *FirecrawlApp) CheckCrawlStatus(ID string) (any, error) {
 // - string: The status of the crawl job after cancellation.
 // - error: An error if the crawl job cancellation request fails.
 func (app *FirecrawlApp) CancelCrawlJob(ID string) (string, error) {
-	headers := app.prepareHeaders("")
-	apiURL := ""
-	if app.Version == "v0" {
-		apiURL = fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, ID)
-	} else if app.Version == "v1" {
-		apiURL = fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
-	}
+	headers := app.prepareHeaders(nil)
+	apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID)
 	resp, err := app.makeRequest(
 		http.MethodDelete,
 		apiURL,
@@ -568,20 +535,28 @@ func (app *FirecrawlApp) CancelCrawlJob(ID string) (string, error) {
 // Returns:
 // - *MapResponse: The response from the mapping operation.
 // - error: An error if the mapping request fails.
-func (app *FirecrawlApp) MapURL(url string, params map[string]any) (*MapResponse, error) {
-	if app.Version == "v0" {
-		return nil, fmt.Errorf("map is not supported in v0")
-	}
-
-	headers := app.prepareHeaders("")
+func (app *FirecrawlApp) MapURL(url string, params *MapParams) (*MapResponse, error) {
+	headers := app.prepareHeaders(nil)
 	jsonData := map[string]any{"url": url}
-	for k, v := range params {
-		jsonData[k] = v
+
+	if params != nil {
+		if params.IncludeSubdomains != nil {
+			jsonData["includeSubdomains"] = params.IncludeSubdomains
+		}
+		if params.Search != nil {
+			jsonData["search"] = params.Search
+		}
+		if params.IgnoreSitemap != nil {
+			jsonData["ignoreSitemap"] = params.IgnoreSitemap
+		}
+		if params.Limit != nil {
+			jsonData["limit"] = params.Limit
+		}
 	}

 	resp, err := app.makeRequest(
 		http.MethodPost,
-		fmt.Sprintf("%s/%s/map", app.APIURL, app.Version),
+		fmt.Sprintf("%s/v1/map", app.APIURL),
 		jsonData,
 		headers,
 		"map",
@@ -603,6 +578,18 @@ func (app *FirecrawlApp) MapURL(url string, params map[string]any) (*MapResponse
 	}
 }

+// Search performs a search using the Firecrawl API.
+//
+// Parameters:
+// - query: The search query string.
+// - params: Optional parameters for the search request.
+//
+// Returns:
+// - error: Always returned, since search is not implemented in API version 1.0.0.
+func (app *FirecrawlApp) Search(query string, params *any) (any, error) {
+	return nil, fmt.Errorf("search is not implemented in API version 1.0.0")
+}
+
 // prepareHeaders prepares the headers for an HTTP request.
 //
 // Parameters:
 // - idempotencyKey: An optional idempotency key to include in the request headers.
@@ -611,13 +598,13 @@ func (app *FirecrawlApp) MapURL(url string, params map[string]any) (*MapResponse
 //
 // Returns:
 // - map[string]string: A map containing the headers for the HTTP request.
-func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
+func (app *FirecrawlApp) prepareHeaders(idempotencyKey *string) map[string]string {
 	headers := map[string]string{
 		"Content-Type":  "application/json",
 		"Authorization": fmt.Sprintf("Bearer %s", app.APIKey),
 	}
-	if idempotencyKey != "" {
-		headers["x-idempotency-key"] = idempotencyKey
+	if idempotencyKey != nil {
+		headers["x-idempotency-key"] = *idempotencyKey
 	}
 	return headers
 }
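A sketch of MapURL with the new typed MapParams, under the same assumptions; the Links field on MapResponse is also an assumption, inferred from the v1 map endpoint rather than shown in this hunk.

package main

import (
	"fmt"
	"log"

	firecrawl "github.com/mendableai/firecrawl-go" // assumed module path
)

func ptr[T any](v T) *T { return &v }

func main() {
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatal(err)
	}

	// Map discovers URLs on a site; Search narrows them and Limit caps the count.
	mapped, err := app.MapURL("https://firecrawl.dev", &firecrawl.MapParams{
		Search: ptr("docs"),
		Limit:  ptr(100),
	})
	if err != nil {
		log.Fatal(err)
	}
	// Links is assumed to be the slice of discovered URLs on MapResponse.
	fmt.Println(mapped.Links)
}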
@@ -691,21 +678,15 @@ func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, he
 // - pollInterval: The interval (in seconds) at which to poll the job status.
 //
 // Returns:
-// - []*FirecrawlDocument or []*FirecrawlDocumentV0: The crawl result if the job is completed.
+// - *CrawlStatusResponse: The crawl result if the job is completed.
 // - error: An error if the crawl status check request fails.
-func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string, pollInterval int, checkUrl string) (any, error) {
-	attempts := 0
-	apiURL := ""
-	if app.Version == "v0" {
-		apiURL = fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, ID)
-	} else if app.Version == "v1" {
-		apiURL = checkUrl
-	}
+func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string, pollInterval int) (*CrawlStatusResponse, error) {
+	attempts := 3

 	for {
 		resp, err := app.makeRequest(
 			http.MethodGet,
-			apiURL,
+			fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID),
 			nil,
 			headers,
 			"check crawl status",
@@ -716,59 +697,30 @@ func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string,
 			return nil, err
 		}

-		if app.Version == "v0" {
-			var statusData JobStatusResponseV0
-			err = json.Unmarshal(resp, &statusData)
-			if err != nil {
-				return nil, err
-			}
-
-			status := statusData.Status
-			if status == "" {
-				return nil, fmt.Errorf("invalid status in response")
-			}
-
-			if status == "completed" {
-				if statusData.Data != nil {
-					return statusData.Data, nil
-				}
-				attempts++
-				if attempts > 3 {
-					return nil, fmt.Errorf("crawl job completed but no data was returned")
-				}
-			} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" {
-				pollInterval = max(pollInterval, 2)
-				time.Sleep(time.Duration(pollInterval) * time.Second)
-			} else {
-				return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
-			}
-
-		} else if app.Version == "v1" {
-			var statusData CrawlStatusResponse
-			err = json.Unmarshal(resp, &statusData)
-			if err != nil {
-				return nil, err
-			}
-
-			status := statusData.Status
-			if status == "" {
-				return nil, fmt.Errorf("invalid status in response")
-			}
-
-			if status == "completed" {
-				if statusData.Data != nil {
-					return statusData, nil
-				}
-				attempts++
-				if attempts > 3 {
-					return nil, fmt.Errorf("crawl job completed but no data was returned")
-				}
-			} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" {
-				pollInterval = max(pollInterval, 2)
-				time.Sleep(time.Duration(pollInterval) * time.Second)
-			} else {
-				return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
-			}
+		var statusData CrawlStatusResponse
+		err = json.Unmarshal(resp, &statusData)
+		if err != nil {
+			return nil, err
+		}
+
+		status := statusData.Status
+		if status == "" {
+			return nil, fmt.Errorf("invalid status in response")
+		}
+
+		if status == "completed" {
+			if statusData.Data != nil {
+				return &statusData, nil
+			}
+			attempts--
+			if attempts == 0 {
+				return nil, fmt.Errorf("crawl job completed but no data was returned")
+			}
+		} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" {
+			pollInterval = max(pollInterval, 2)
+			time.Sleep(time.Duration(pollInterval) * time.Second)
+		} else {
+			return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
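Before the test-file changes, a reviewer sketch of how the polling contract above can be exercised against a stub server. httptest and sync/atomic are standard library; the JSON bodies are assumptions derived from the response structs earlier in the patch.

package main

import (
	"fmt"
	"log"
	"net/http"
	"net/http/httptest"
	"sync/atomic"

	firecrawl "github.com/mendableai/firecrawl-go" // assumed module path
)

func main() {
	var polls atomic.Int32
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method == http.MethodPost { // POST /v1/crawl: hand back a job ID
			fmt.Fprint(w, `{"success":true,"id":"job-1"}`)
			return
		}
		// GET /v1/crawl/job-1: report "scraping" twice, then complete.
		if polls.Add(1) < 3 {
			fmt.Fprint(w, `{"status":"scraping"}`)
			return
		}
		fmt.Fprint(w, `{"status":"completed","data":[{"markdown":"# hi"}]}`)
	}))
	defer srv.Close()

	app, err := firecrawl.NewFirecrawlApp("test-key", srv.URL)
	if err != nil {
		log.Fatal(err)
	}

	// monitorJobStatus clamps the poll interval to at least 2 seconds,
	// so this takes roughly 4 seconds before the terminal payload arrives.
	status, err := app.CrawlURL("https://example.com", nil, nil, 2)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(status.Status, len(status.Data)) // completed 1
}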
Status: %s", status) } } } diff --git a/firecrawl_test.go b/firecrawl_test.go index 101cbd8..cf2b062 100644 --- a/firecrawl_test.go +++ b/firecrawl_test.go @@ -15,8 +15,12 @@ import ( var API_URL string var TEST_API_KEY string +func ptr[T any](v T) *T { + return &v +} + func init() { - err := godotenv.Load("../.env") + err := godotenv.Load(".env") if err != nil { log.Fatalf("Error loading .env file: %v", err) } @@ -25,13 +29,13 @@ func init() { } func TestNoAPIKey(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL, "v1") + _, err := NewFirecrawlApp("", API_URL) assert.Error(t, err) assert.Contains(t, err.Error(), "no API key provided") } func TestScrapeURLInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL, "v1") + app, err := NewFirecrawlApp("invalid_api_key", API_URL) require.NoError(t, err) _, err = app.ScrapeURL("https://firecrawl.dev", nil) @@ -40,7 +44,7 @@ func TestScrapeURLInvalidAPIKey(t *testing.T) { } func TestBlocklistedURL(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) @@ -49,127 +53,117 @@ func TestBlocklistedURL(t *testing.T) { } func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL, "v1") + app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL) require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) require.NoError(t, err) assert.NotNil(t, response) - scrapeResponse := response.(*FirecrawlDocument) - assert.Contains(t, scrapeResponse.Markdown, "_Roast_") + assert.Contains(t, response.Markdown, "_Roast_") } func TestScrapeURLE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) require.NoError(t, err) assert.NotNil(t, response) - scrapeResponse := response.(*FirecrawlDocument) - assert.Contains(t, scrapeResponse.Markdown, "_Roast_") - assert.NotEqual(t, scrapeResponse.Markdown, "") - assert.NotNil(t, scrapeResponse.Metadata) - assert.Equal(t, scrapeResponse.HTML, "") + assert.Contains(t, response.Markdown, "_Roast_") + assert.NotEqual(t, response.Markdown, "") + assert.NotNil(t, response.Metadata) + assert.Equal(t, response.HTML, "") } func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL, "v1") + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - params := map[string]any{ - "formats": []string{"markdown", "html", "rawHtml", "screenshot", "links"}, - "headers": map[string]string{"x-key": "test"}, - "includeTags": []string{"h1"}, - "excludeTags": []string{"h2"}, - "onlyMainContent": true, - "timeout": 30000, - "waitFor": 1000, + params := ScrapeParams{ + Formats: []string{"markdown", "html", "rawHtml", "screenshot", "links"}, + Headers: ptr(map[string]string{"x-key": "test"}), + IncludeTags: []string{"h1"}, + ExcludeTags: []string{"h2"}, + OnlyMainContent: ptr(true), + Timeout: ptr(30000), + WaitFor: ptr(1000), } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + response, err := app.ScrapeURL("https://roastmywebsite.ai", ¶ms) require.NoError(t, err) assert.NotNil(t, response) - scrapeResponse := response.(*FirecrawlDocument) - assert.NotNil(t, scrapeResponse) - 
-	assert.Contains(t, scrapeResponse.Markdown, "_Roast_")
+	assert.Contains(t, scrapeResponse.HTML, "