Python sdk/v2.1.0 (#1479)

* scrape options fixing types

* fix

* version bump

* Update __init__.py

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
Rafael Miller
2025-04-18 13:42:36 -07:00
committed by GitHub
parent f451b71308
commit 9e259571b1
4 changed files with 181 additions and 147 deletions
+1 -20
View File
@@ -42,23 +42,7 @@ while attempts > 0 and crawl_status.status != 'completed':
crawl_status = app.check_crawl_status(async_result.id) crawl_status = app.check_crawl_status(async_result.id)
print(crawl_status) print(crawl_status)
# LLM Extraction: # JSON format:
# Define schema to extract contents into using pydantic
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., description="Top 5 stories")
extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema())
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
print(llm_extraction_result.extract)
# Define schema to extract contents into using json schema # Define schema to extract contents into using json schema
json_schema = { json_schema = {
"type": "object", "type": "object",
@@ -86,9 +70,6 @@ llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=[
print(llm_extraction_result.json) print(llm_extraction_result.json)
print(llm_extraction_result['llm_extraction'])
# Map a website: # Map a website:
map_result = app.map_url('https://firecrawl.dev', search="blog") map_result = app.map_url('https://firecrawl.dev', search="blog")
print(map_result) print(map_result)
+17 -1
View File
@@ -2,7 +2,7 @@ import time
import nest_asyncio import nest_asyncio
import uuid import uuid
import asyncio import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp from firecrawl.firecrawl import AsyncFirecrawlApp, ScrapeOptions, JsonConfig
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
@@ -84,6 +84,20 @@ async def example_map_and_extract():
extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
print(extract_result) print(extract_result)
async def example_deep_research():
    """Run one deep-research query and print whatever the client returns.

    Uses the ``app`` client from the enclosing scope (its construction is
    not visible in this excerpt). The request is limited with ``max_urls=4``.
    """
    question = "What are the latest developments in large language models?"
    findings = await app.deep_research(question, max_urls=4)
    print("Research Results:", findings)
async def example_generate_llms_text():
    """Request LLMs.txt generation for firecrawl.dev and print the result.

    Uses the ``app`` client from the enclosing scope (its construction is
    not visible in this excerpt).
    """
    target_url = "https://firecrawl.dev"
    generated = await app.generate_llms_text(target_url)
    print("LLMs.txt Results:", generated)
# Define event handlers for websocket # Define event handlers for websocket
def on_document(detail): def on_document(detail):
print("DOC", detail) print("DOC", detail)
@@ -115,6 +129,8 @@ async def main():
await example_llm_extraction() await example_llm_extraction()
await example_map_and_extract() await example_map_and_extract()
await example_websocket_crawl() await example_websocket_crawl()
await example_deep_research()
await example_generate_llms_text()
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
+2 -2
View File
@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
import logging import logging
import os import os
from .firecrawl import FirecrawlApp, JsonConfig # noqa from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
__version__ = "2.0.2" __version__ = "2.1.0"
# Define the logger for the Firecrawl project # Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl") logger: logging.Logger = logging.getLogger("firecrawl")
+157 -120
View File
@@ -97,6 +97,16 @@ class ActionsResult(pydantic.BaseModel):
"""Result of actions performed during scraping.""" """Result of actions performed during scraping."""
screenshots: List[str] screenshots: List[str]
class ChangeTrackingData(pydantic.BaseModel):
    """Payload attached to a document for the change tracking format.

    ``changeStatus`` and ``visibility`` are required; the remaining fields
    default to ``None`` when absent.
    """

    previousScrapeAt: Optional[str] = None
    # One of: "new" | "same" | "changed" | "removed"
    changeStatus: str
    # One of: "visible" | "hidden"
    visibility: str
    diff: Optional[Dict[str, Any]] = None
    # NOTE(review): this field name shadows ``BaseModel.json`` — confirm that
    # is intended before renaming, since callers read ``.json`` directly.
    json: Optional[Any] = None
class FirecrawlDocument(pydantic.BaseModel, Generic[T]): class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
"""Document retrieved or processed by Firecrawl.""" """Document retrieved or processed by Firecrawl."""
url: Optional[str] = None url: Optional[str] = None
@@ -111,6 +121,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
actions: Optional[ActionsResult] = None actions: Optional[ActionsResult] = None
title: Optional[str] = None # v1 search only title: Optional[str] = None # v1 search only
description: Optional[str] = None # v1 search only description: Optional[str] = None # v1 search only
changeTracking: Optional[ChangeTrackingData] = None
class LocationConfig(pydantic.BaseModel): class LocationConfig(pydantic.BaseModel):
"""Location configuration for scraping.""" """Location configuration for scraping."""
@@ -124,9 +135,9 @@ class WebhookConfig(pydantic.BaseModel):
metadata: Optional[Dict[str, str]] = None metadata: Optional[Dict[str, str]] = None
events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
class CommonOptions(pydantic.BaseModel): class ScrapeOptions(pydantic.BaseModel):
"""Parameters for scraping operations.""" """Parameters for scraping operations."""
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
headers: Optional[Dict[str, str]] = None headers: Optional[Dict[str, str]] = None
includeTags: Optional[List[str]] = None includeTags: Optional[List[str]] = None
excludeTags: Optional[List[str]] = None excludeTags: Optional[List[str]] = None
@@ -193,7 +204,7 @@ class JsonConfig(pydantic.BaseModel):
systemPrompt: Optional[str] = None systemPrompt: Optional[str] = None
agent: Optional[ExtractAgent] = None agent: Optional[ExtractAgent] = None
class ScrapeParams(CommonOptions): class ScrapeParams(ScrapeOptions):
"""Parameters for scraping operations.""" """Parameters for scraping operations."""
extract: Optional[JsonConfig] = None extract: Optional[JsonConfig] = None
jsonOptions: Optional[JsonConfig] = None jsonOptions: Optional[JsonConfig] = None
@@ -235,7 +246,7 @@ class CrawlParams(pydantic.BaseModel):
allowBackwardLinks: Optional[bool] = None allowBackwardLinks: Optional[bool] = None
allowExternalLinks: Optional[bool] = None allowExternalLinks: Optional[bool] = None
ignoreSitemap: Optional[bool] = None ignoreSitemap: Optional[bool] = None
scrapeOptions: Optional[CommonOptions] = None scrapeOptions: Optional[ScrapeOptions] = None
webhook: Optional[Union[str, WebhookConfig]] = None webhook: Optional[Union[str, WebhookConfig]] = None
deduplicateSimilarURLs: Optional[bool] = None deduplicateSimilarURLs: Optional[bool] = None
ignoreQueryParameters: Optional[bool] = None ignoreQueryParameters: Optional[bool] = None
@@ -289,7 +300,7 @@ class ExtractParams(pydantic.BaseModel):
includeSubdomains: Optional[bool] = None includeSubdomains: Optional[bool] = None
origin: Optional[str] = None origin: Optional[str] = None
showSources: Optional[bool] = None showSources: Optional[bool] = None
scrapeOptions: Optional[CommonOptions] = None scrapeOptions: Optional[ScrapeOptions] = None
class ExtractResponse(pydantic.BaseModel, Generic[T]): class ExtractResponse(pydantic.BaseModel, Generic[T]):
"""Response from extract operations.""" """Response from extract operations."""
@@ -309,7 +320,7 @@ class SearchParams(pydantic.BaseModel):
location: Optional[str] = None location: Optional[str] = None
origin: Optional[str] = "api" origin: Optional[str] = "api"
timeout: Optional[int] = 60000 timeout: Optional[int] = 60000
scrapeOptions: Optional[CommonOptions] = None scrapeOptions: Optional[ScrapeOptions] = None
class SearchResponse(pydantic.BaseModel): class SearchResponse(pydantic.BaseModel):
"""Response from search operations.""" """Response from search operations."""
@@ -378,16 +389,6 @@ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
error: Optional[str] = None error: Optional[str] = None
expiresAt: str expiresAt: str
class ChangeTrackingData(pydantic.BaseModel):
"""
Data for the change tracking format.
"""
previousScrapeAt: Optional[str] = None
changeStatus: str # "new" | "same" | "changed" | "removed"
visibility: str # "visible" | "hidden"
diff: Optional[Dict[str, Any]] = None
json: Optional[Any] = None
class SearchResponse(pydantic.BaseModel): class SearchResponse(pydantic.BaseModel):
""" """
Response from the search operation. Response from the search operation.
@@ -442,7 +443,7 @@ class FirecrawlApp:
self, self,
url: str, url: str,
*, *,
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
include_tags: Optional[List[str]] = None, include_tags: Optional[List[str]] = None,
exclude_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None,
only_main_content: Optional[bool] = None, only_main_content: Optional[bool] = None,
@@ -568,7 +569,7 @@ class FirecrawlApp:
country: Optional[str] = None, country: Optional[str] = None,
location: Optional[str] = None, location: Optional[str] = None,
timeout: Optional[int] = None, timeout: Optional[int] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
params: Optional[Union[Dict[str, Any], SearchParams]] = None, params: Optional[Union[Dict[str, Any], SearchParams]] = None,
**kwargs) -> SearchResponse: **kwargs) -> SearchResponse:
""" """
@@ -583,7 +584,7 @@ class FirecrawlApp:
country (Optional[str]): Country code (default: "us") country (Optional[str]): Country code (default: "us")
location (Optional[str]): Geo-targeting location (Optional[str]): Geo-targeting
timeout (Optional[int]): Request timeout in milliseconds timeout (Optional[int]): Request timeout in milliseconds
scrape_options (Optional[CommonOptions]): Result scraping configuration scrape_options (Optional[ScrapeOptions]): Result scraping configuration
params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
**kwargs: Additional keyword arguments for future compatibility **kwargs: Additional keyword arguments for future compatibility
@@ -664,7 +665,7 @@ class FirecrawlApp:
allow_backward_links: Optional[bool] = None, allow_backward_links: Optional[bool] = None,
allow_external_links: Optional[bool] = None, allow_external_links: Optional[bool] = None,
ignore_sitemap: Optional[bool] = None, ignore_sitemap: Optional[bool] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
webhook: Optional[Union[str, WebhookConfig]] = None, webhook: Optional[Union[str, WebhookConfig]] = None,
deduplicate_similar_urls: Optional[bool] = None, deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None,
@@ -686,7 +687,7 @@ class FirecrawlApp:
allow_backward_links (Optional[bool]): Follow parent directory links allow_backward_links (Optional[bool]): Follow parent directory links
allow_external_links (Optional[bool]): Follow external domain links allow_external_links (Optional[bool]): Follow external domain links
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
scrape_options (Optional[CommonOptions]): Page scraping configuration scrape_options (Optional[ScrapeOptions]): Page scraping configuration
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
deduplicate_similar_urls (Optional[bool]): Remove similar URLs deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -768,7 +769,7 @@ class FirecrawlApp:
allow_backward_links: Optional[bool] = None, allow_backward_links: Optional[bool] = None,
allow_external_links: Optional[bool] = None, allow_external_links: Optional[bool] = None,
ignore_sitemap: Optional[bool] = None, ignore_sitemap: Optional[bool] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
webhook: Optional[Union[str, WebhookConfig]] = None, webhook: Optional[Union[str, WebhookConfig]] = None,
deduplicate_similar_urls: Optional[bool] = None, deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None,
@@ -789,7 +790,7 @@ class FirecrawlApp:
allow_backward_links (Optional[bool]): Follow parent directory links allow_backward_links (Optional[bool]): Follow parent directory links
allow_external_links (Optional[bool]): Follow external domain links allow_external_links (Optional[bool]): Follow external domain links
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
scrape_options (Optional[CommonOptions]): Page scraping configuration scrape_options (Optional[ScrapeOptions]): Page scraping configuration
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
deduplicate_similar_urls (Optional[bool]): Remove similar URLs deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1007,7 +1008,7 @@ class FirecrawlApp:
allow_backward_links: Optional[bool] = None, allow_backward_links: Optional[bool] = None,
allow_external_links: Optional[bool] = None, allow_external_links: Optional[bool] = None,
ignore_sitemap: Optional[bool] = None, ignore_sitemap: Optional[bool] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
webhook: Optional[Union[str, WebhookConfig]] = None, webhook: Optional[Union[str, WebhookConfig]] = None,
deduplicate_similar_urls: Optional[bool] = None, deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None,
@@ -1028,7 +1029,7 @@ class FirecrawlApp:
allow_backward_links (Optional[bool]): Follow parent directory links allow_backward_links (Optional[bool]): Follow parent directory links
allow_external_links (Optional[bool]): Follow external domain links allow_external_links (Optional[bool]): Follow external domain links
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
scrape_options (Optional[CommonOptions]): Page scraping configuration scrape_options (Optional[ScrapeOptions]): Page scraping configuration
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
deduplicate_similar_urls (Optional[bool]): Remove similar URLs deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1741,7 +1742,7 @@ class FirecrawlApp:
def async_extract( def async_extract(
self, self,
urls: List[str], urls: Optional[List[str]] = None,
*, *,
prompt: Optional[str] = None, prompt: Optional[str] = None,
schema: Optional[Any] = None, schema: Optional[Any] = None,
@@ -1749,8 +1750,7 @@ class FirecrawlApp:
allow_external_links: Optional[bool] = False, allow_external_links: Optional[bool] = False,
enable_web_search: Optional[bool] = False, enable_web_search: Optional[bool] = False,
show_sources: Optional[bool] = False, show_sources: Optional[bool] = False,
agent: Optional[Dict[str, Any]] = None, agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
""" """
Initiate an asynchronous extract job. Initiate an asynchronous extract job.
@@ -1774,7 +1774,7 @@ class FirecrawlApp:
Raises: Raises:
ValueError: If job initiation fails ValueError: If job initiation fails
""" """
headers = self._prepare_headers(idempotency_key) headers = self._prepare_headers()
schema = schema schema = schema
if schema: if schema:
@@ -2922,9 +2922,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
headers headers
) )
if response.status_code == 200: if response.get('success'):
try: try:
id = response.json().get('id') id = response.get('id')
except: except:
raise Exception(f'Failed to parse Firecrawl response as JSON.') raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval) return self._monitor_job_status(id, headers, poll_interval)
@@ -3050,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
headers headers
) )
if response.status_code == 200: if response.get('status_code') == 200:
try: try:
return BatchScrapeResponse(**response.json()) return BatchScrapeResponse(**response.json())
except: except:
@@ -3070,7 +3070,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
allow_backward_links: Optional[bool] = None, allow_backward_links: Optional[bool] = None,
allow_external_links: Optional[bool] = None, allow_external_links: Optional[bool] = None,
ignore_sitemap: Optional[bool] = None, ignore_sitemap: Optional[bool] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
webhook: Optional[Union[str, WebhookConfig]] = None, webhook: Optional[Union[str, WebhookConfig]] = None,
deduplicate_similar_urls: Optional[bool] = None, deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None,
@@ -3092,7 +3092,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
allow_backward_links (Optional[bool]): Follow parent directory links allow_backward_links (Optional[bool]): Follow parent directory links
allow_external_links (Optional[bool]): Follow external domain links allow_external_links (Optional[bool]): Follow external domain links
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
scrape_options (Optional[CommonOptions]): Page scraping configuration scrape_options (Optional[ScrapeOptions]): Page scraping configuration
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
deduplicate_similar_urls (Optional[bool]): Remove similar URLs deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3148,15 +3148,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
params_dict = final_params.dict(exclude_none=True) params_dict = final_params.dict(exclude_none=True)
params_dict['url'] = url params_dict['url'] = url
params_dict['origin'] = f"python-sdk@{version}" params_dict['origin'] = f"python-sdk@{version}"
# Make request # Make request
headers = self._prepare_headers(idempotency_key) headers = self._prepare_headers(idempotency_key)
response = await self._async_post_request( response = await self._async_post_request(
f'{self.api_url}/v1/crawl', params_dict, headers) f'{self.api_url}/v1/crawl', params_dict, headers)
if response.status_code == 200: print(response)
if response.get('success'):
try: try:
id = response.json().get('id') id = response.get('id')
except: except:
raise Exception(f'Failed to parse Firecrawl response as JSON.') raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval) return self._monitor_job_status(id, headers, poll_interval)
@@ -3176,11 +3176,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
allow_backward_links: Optional[bool] = None, allow_backward_links: Optional[bool] = None,
allow_external_links: Optional[bool] = None, allow_external_links: Optional[bool] = None,
ignore_sitemap: Optional[bool] = None, ignore_sitemap: Optional[bool] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
webhook: Optional[Union[str, WebhookConfig]] = None, webhook: Optional[Union[str, WebhookConfig]] = None,
deduplicate_similar_urls: Optional[bool] = None, deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None,
regex_on_full_url: Optional[bool] = None, regex_on_full_url: Optional[bool] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None, idempotency_key: Optional[str] = None,
**kwargs **kwargs
) -> CrawlResponse: ) -> CrawlResponse:
@@ -3197,7 +3198,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
allow_backward_links (Optional[bool]): Follow parent directory links allow_backward_links (Optional[bool]): Follow parent directory links
allow_external_links (Optional[bool]): Follow external domain links allow_external_links (Optional[bool]): Follow external domain links
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
scrape_options (Optional[CommonOptions]): Page scraping configuration scrape_options (Optional[ScrapeOptions]): Page scraping configuration
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
deduplicate_similar_urls (Optional[bool]): Remove similar URLs deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3262,9 +3263,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
headers headers
) )
if response.status_code == 200: if response.get('success'):
try: try:
return CrawlResponse(**response.json()) return CrawlResponse(**response)
except: except:
raise Exception(f'Failed to parse Firecrawl response as JSON.') raise Exception(f'Failed to parse Firecrawl response as JSON.')
else: else:
@@ -3303,7 +3304,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
headers headers
) )
if status_data['status'] == 'completed': if status_data.get('status') == 'completed':
if 'data' in status_data: if 'data' in status_data:
data = status_data['data'] data = status_data['data']
while 'next' in status_data: while 'next' in status_data:
@@ -3317,26 +3318,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
data.extend(next_data.get('data', [])) data.extend(next_data.get('data', []))
status_data = next_data status_data = next_data
status_data['data'] = data status_data['data'] = data
# Create CrawlStatusResponse object from status data
response = { response = CrawlStatusResponse(
'status': status_data.get('status'), status=status_data.get('status'),
'total': status_data.get('total'), total=status_data.get('total'),
'completed': status_data.get('completed'), completed=status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'), creditsUsed=status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'), expiresAt=status_data.get('expiresAt'),
'data': status_data.get('data') data=status_data.get('data'),
} success=False if 'error' in status_data else True
)
if 'error' in status_data: if 'error' in status_data:
response['error'] = status_data['error'] response.error = status_data.get('error')
if 'next' in status_data: if 'next' in status_data:
response['next'] = status_data['next'] response.next = status_data.get('next')
return { return response
'success': False if 'error' in status_data else True,
**response
}
async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse: async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
""" """
@@ -3359,7 +3358,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
headers headers
) )
if status_data['status'] == 'completed': if status_data.get('status') == 'completed':
if 'data' in status_data: if 'data' in status_data:
data = status_data['data'] data = status_data['data']
while 'next' in status_data: while 'next' in status_data:
@@ -3376,7 +3375,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
return status_data return status_data
else: else:
raise Exception('Job completed but no data was returned') raise Exception('Job completed but no data was returned')
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
await asyncio.sleep(max(poll_interval, 2)) await asyncio.sleep(max(poll_interval, 2))
else: else:
raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}') raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
@@ -3384,6 +3383,13 @@ class AsyncFirecrawlApp(FirecrawlApp):
async def map_url( async def map_url(
self, self,
url: str, url: str,
*,
search: Optional[str] = None,
ignore_sitemap: Optional[bool] = None,
include_subdomains: Optional[bool] = None,
sitemap_only: Optional[bool] = None,
limit: Optional[int] = None,
timeout: Optional[int] = None,
params: Optional[MapParams] = None) -> MapResponse: params: Optional[MapParams] = None) -> MapResponse:
""" """
Asynchronously map and discover links from a URL. Asynchronously map and discover links from a URL.
@@ -3409,21 +3415,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
Raises: Raises:
Exception: If mapping fails Exception: If mapping fails
""" """
headers = self._prepare_headers() map_params = {}
json_data = {'url': url}
if params: if params:
json_data.update(params) map_params.update(params.dict(exclude_none=True))
json_data['origin'] = f"python-sdk@{version}"
# Add individual parameters
if search is not None:
map_params['search'] = search
if ignore_sitemap is not None:
map_params['ignoreSitemap'] = ignore_sitemap
if include_subdomains is not None:
map_params['includeSubdomains'] = include_subdomains
if sitemap_only is not None:
map_params['sitemapOnly'] = sitemap_only
if limit is not None:
map_params['limit'] = limit
if timeout is not None:
map_params['timeout'] = timeout
# Create final params object
final_params = MapParams(**map_params)
params_dict = final_params.dict(exclude_none=True)
params_dict['url'] = url
params_dict['origin'] = f"python-sdk@{version}"
# Make request
endpoint = f'/v1/map' endpoint = f'/v1/map'
response = await self._async_post_request( response = await self._async_post_request(
f'{self.api_url}{endpoint}', f'{self.api_url}{endpoint}',
json_data, params_dict,
headers headers={"Authorization": f"Bearer {self.api_key}"}
) )
if response.get('success') and 'links' in response: if response.get('success') and 'links' in response:
return response return MapResponse(**response)
elif 'error' in response: elif 'error' in response:
raise Exception(f'Failed to map URL. Error: {response["error"]}') raise Exception(f'Failed to map URL. Error: {response["error"]}')
else: else:
@@ -3431,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
async def extract( async def extract(
self, self,
urls: List[str], urls: Optional[List[str]] = None,
params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: *,
prompt: Optional[str] = None,
schema: Optional[Any] = None,
system_prompt: Optional[str] = None,
allow_external_links: Optional[bool] = False,
enable_web_search: Optional[bool] = False,
show_sources: Optional[bool] = False,
agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
""" """
Asynchronously extract structured information from URLs. Asynchronously extract structured information from URLs.
Args: Args:
urls (List[str]): URLs to extract from urls (Optional[List[str]]): URLs to extract from
params (Optional[ExtractParams]): See ExtractParams model: prompt (Optional[str]): Custom extraction prompt
Extraction Config: schema (Optional[Any]): JSON schema/Pydantic model
* prompt - Custom extraction prompt system_prompt (Optional[str]): System context
* schema - JSON schema/Pydantic model allow_external_links (Optional[bool]): Follow external links
* systemPrompt - System context enable_web_search (Optional[bool]): Enable web search
show_sources (Optional[bool]): Include source URLs
Behavior Options: agent (Optional[Dict[str, Any]]): Agent configuration
* allowExternalLinks - Follow external links
* enableWebSearch - Enable web search
* includeSubdomains - Include subdomains
* showSources - Include source URLs
Scraping Options:
* scrapeOptions - Page scraping config
Returns: Returns:
ExtractResponse with: ExtractResponse with:
@@ -3464,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
""" """
headers = self._prepare_headers() headers = self._prepare_headers()
if not params or (not params.get('prompt') and not params.get('schema')): if not prompt and not schema:
raise ValueError("Either prompt or schema is required") raise ValueError("Either prompt or schema is required")
schema = params.get('schema') if not urls and not prompt:
raise ValueError("Either urls or prompt is required")
if schema: if schema:
if hasattr(schema, 'model_json_schema'): if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema() schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
request_data = { request_data = {
'urls': urls, 'urls': urls or [],
'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)), 'allowExternalLinks': allow_external_links,
'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), 'enableWebSearch': enable_web_search,
'showSources': params.get('show_sources', params.get('showSources', False)), 'showSources': show_sources,
'schema': schema, 'schema': schema,
'origin': f'python-sdk@{version}' 'origin': f'python-sdk@{get_version()}'
} }
if params.get('prompt'): # Only add prompt and systemPrompt if they exist
request_data['prompt'] = params['prompt'] if prompt:
if params.get('system_prompt'): request_data['prompt'] = prompt
request_data['systemPrompt'] = params['system_prompt'] if system_prompt:
elif params.get('systemPrompt'): request_data['systemPrompt'] = system_prompt
request_data['systemPrompt'] = params['systemPrompt']
if agent:
request_data['agent'] = agent
response = await self._async_post_request( response = await self._async_post_request(
f'{self.api_url}/v1/extract', f'{self.api_url}/v1/extract',
@@ -3506,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
) )
if status_data['status'] == 'completed': if status_data['status'] == 'completed':
return status_data return ExtractResponse(**status_data)
elif status_data['status'] in ['failed', 'cancelled']: elif status_data['status'] in ['failed', 'cancelled']:
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
@@ -3562,14 +3594,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
status_data = next_data status_data = next_data
status_data['data'] = data status_data['data'] = data
response = { response = BatchScrapeStatusResponse(
'status': status_data.get('status'), status=status_data.get('status'),
'total': status_data.get('total'), total=status_data.get('total'),
'completed': status_data.get('completed'), completed=status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'), creditsUsed=status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'), expiresAt=status_data.get('expiresAt'),
'data': status_data.get('data') data=status_data.get('data')
} )
if 'error' in status_data: if 'error' in status_data:
response['error'] = status_data['error'] response['error'] = status_data['error']
@@ -3689,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
allow_external_links: Optional[bool] = False, allow_external_links: Optional[bool] = False,
enable_web_search: Optional[bool] = False, enable_web_search: Optional[bool] = False,
show_sources: Optional[bool] = False, show_sources: Optional[bool] = False,
agent: Optional[Dict[str, Any]] = None, agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
""" """
Initiate an asynchronous extraction job without waiting for completion. Initiate an asynchronous extraction job without waiting for completion.
@@ -3714,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
Raises: Raises:
ValueError: If job initiation fails ValueError: If job initiation fails
""" """
headers = self._prepare_headers(idempotency_key) headers = self._prepare_headers()
if not prompt and not schema: if not prompt and not schema:
raise ValueError("Either prompt or schema is required") raise ValueError("Either prompt or schema is required")
@@ -3726,14 +3757,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
if hasattr(schema, 'model_json_schema'): if hasattr(schema, 'model_json_schema'):
schema = schema.model_json_schema() schema = schema.model_json_schema()
request_data = { request_data = ExtractResponse(
'urls': urls or [], urls=urls or [],
'allowExternalLinks': allow_external_links, allowExternalLinks=allow_external_links,
'enableWebSearch': enable_web_search, enableWebSearch=enable_web_search,
'showSources': show_sources, showSources=show_sources,
'schema': schema, schema=schema,
'origin': f'python-sdk@{version}' origin=f'python-sdk@{version}'
} )
if prompt: if prompt:
request_data['prompt'] = prompt request_data['prompt'] = prompt
@@ -3810,7 +3841,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
await asyncio.sleep(2) await asyncio.sleep(2)
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
async def async_generate_llms_text( async def async_generate_llms_text(
self, self,
@@ -3845,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
if experimental_stream is not None: if experimental_stream is not None:
params['__experimental_stream'] = experimental_stream params['__experimental_stream'] = experimental_stream
params = GenerateLLMsTextParams(
maxUrls=max_urls,
showFullText=show_full_text,
__experimental_stream=experimental_stream
)
headers = self._prepare_headers() headers = self._prepare_headers()
json_data = {'url': url, **params.dict(exclude_none=True)} json_data = {'url': url, **params.dict(exclude_none=True)}
json_data['origin'] = f"python-sdk@{version}" json_data['origin'] = f"python-sdk@{version}"
@@ -3981,7 +4018,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
await asyncio.sleep(2) await asyncio.sleep(2)
return {'success': False, 'error': 'Deep research job terminated unexpectedly'} return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
async def async_deep_research( async def async_deep_research(
self, self,
@@ -4088,7 +4125,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
country: Optional[str] = None, country: Optional[str] = None,
location: Optional[str] = None, location: Optional[str] = None,
timeout: Optional[int] = None, timeout: Optional[int] = None,
scrape_options: Optional[CommonOptions] = None, scrape_options: Optional[ScrapeOptions] = None,
params: Optional[Union[Dict[str, Any], SearchParams]] = None, params: Optional[Union[Dict[str, Any], SearchParams]] = None,
**kwargs) -> SearchResponse: **kwargs) -> SearchResponse:
""" """
@@ -4103,7 +4140,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
country (Optional[str]): Country code (default: "us") country (Optional[str]): Country code (default: "us")
location (Optional[str]): Geo-targeting location (Optional[str]): Geo-targeting
timeout (Optional[int]): Request timeout in milliseconds timeout (Optional[int]): Request timeout in milliseconds
scrape_options (Optional[CommonOptions]): Result scraping configuration scrape_options (Optional[ScrapeOptions]): Result scraping configuration
params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
**kwargs: Additional keyword arguments for future compatibility **kwargs: Additional keyword arguments for future compatibility