From 0566e54d859330b8942ce0f1c6341760e30a0ded Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 26 Aug 2024 15:16:50 -0400 Subject: [PATCH 01/47] init --- .../turning_docs_into_api_specs/api_spec.json | 771 ++++++++++++++++++ .../dify_api_spec.json | 164 ++++ .../docs.firecrawl.dev/api_spec_0.json | 211 +++++ .../docs.firecrawl.dev/api_spec_1.json | 165 ++++ .../docs.firecrawl.dev/api_spec_10.json | 93 +++ .../docs.firecrawl.dev/api_spec_11.json | 131 +++ .../docs.firecrawl.dev/api_spec_13.json | 87 ++ .../docs.firecrawl.dev/api_spec_15.json | 83 ++ .../docs.firecrawl.dev/api_spec_16.json | 200 +++++ .../docs.firecrawl.dev/api_spec_2.json | 54 ++ .../docs.firecrawl.dev/api_spec_22.json | 166 ++++ .../docs.firecrawl.dev/api_spec_25.json | 229 ++++++ .../docs.firecrawl.dev/api_spec_26.json | 115 +++ .../docs.firecrawl.dev/api_spec_3.json | 185 +++++ .../docs.firecrawl.dev/api_spec_30.json | 212 +++++ .../docs.firecrawl.dev/api_spec_31.json | 199 +++++ .../docs.firecrawl.dev/api_spec_33.json | 202 +++++ .../docs.firecrawl.dev/api_spec_34.json | 201 +++++ .../docs.firecrawl.dev/api_spec_35.json | 245 ++++++ .../docs.firecrawl.dev/api_spec_4.json | 129 +++ .../docs.firecrawl.dev/api_spec_5.json | 186 +++++ .../docs.firecrawl.dev/api_spec_7.json | 86 ++ .../docs.firecrawl.dev/api_spec_8.json | 59 ++ .../docs.firecrawl.dev/combined_api_spec.json | 738 +++++++++++++++++ .../turning_docs_into_api_specs.ipynb | 287 +++++++ 25 files changed, 5198 insertions(+) create mode 100644 examples/turning_docs_into_api_specs/api_spec.json create mode 100644 examples/turning_docs_into_api_specs/dify_api_spec.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json create mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb diff --git a/examples/turning_docs_into_api_specs/api_spec.json b/examples/turning_docs_into_api_specs/api_spec.json new file mode 100644 index 00000000..d866efd3 --- /dev/null +++ b/examples/turning_docs_into_api_specs/api_spec.json @@ -0,0 +1,771 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "/crawl/cancel/{jobId}": { + "/crawl/status/{jobId}": { + "get": { + "/scrape": { + "/search": { + "post": { + "components": { + "securitySchemes": { + "Authorization": { + "bearerFormat": "JWT", + "scheme": "bearer", + "type": "http" + } + } + }, + "description": "Send a request to perform a web search and get scraped results from the top pages.", + "operationId": "searchWeb", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "description": "Options for controlling the scraping behavior of search result pages.", + "properties": { + "fetchPageContent": { + "default": true, + "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", + "type": "boolean" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The search query.", + "required": true, + "type": "string" + }, + "searchOptions": { + "description": "Options for controlling the search.", + "properties": { + "limit": { + "description": "Maximum number of search results to return.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + }, + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "An array of search results.", + "items": { + "properties": { + "content": { + "description": "Raw content of the search result page.", + "type": "string" + }, + "markdown": { + "description": "Markdown content of the search result page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the search result page.", + "properties": { + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "nullable": true, + "type": "string" + }, + "sourceURL": { + "description": "Source URL of the search result page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "url": { + "description": "URL of the search result.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "description": "Indicates if the search was successful.", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Web search completed successfully." + } + } + }, + "summary": "Search the Web" + } + }, + "post": { + "description": "Send a request to scrape a single URL and get its content.", + "operationId": "scrapeURL", + "parameters": [], + "requestBody": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + }, + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", + "properties": { + "extractionPrompt": { + "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes.", + "type": "string" + }, + "extractionSchema": { + "description": "The schema for the data to be extracted, required only for LLM extraction modes.", + "type": "object" + }, + "mode": { + "default": "markdown", + "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM.", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for controlling the scraping behavior.", + "properties": { + "fullPageScreenshot": { + "default": false, + "description": "Include a full page screenshot of the page that you are scraping.", + "type": "boolean" + }, + "headers": { + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", + "type": "object" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + }, + "removeTags": { + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "default": false, + "description": "Replace all relative paths with absolute paths for images and links", + "type": "boolean" + }, + "screenshot": { + "default": false, + "description": "Include a screenshot of the top of the page that you are scraping.", + "type": "boolean" + }, + "waitFor": { + "default": 0, + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "default": 30000, + "description": "Timeout in milliseconds for the request", + "type": "integer" + }, + "url": { + "description": "The URL to scrape.", + "required": true, + "type": "string" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the request.", + "nullable": true, + "type": "string" + }, + "llm_extraction": { + "description": "Extracted data from the page using the specified schema, only present if an LLM extraction mode was used.", + "nullable": true, + "type": "object" + }, + "markdown": { + "description": "Markdown version of the page content.", + "type": "string" + }, + "metadata": { + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": "string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the request.", + "nullable": true, + "type": "string" + }, + "warning": { + "description": "Warning message from the LLM extraction process, if any.", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "success": { + "description": "Indicates whether the scraping was successful.", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "URL scraped successfully." + } + } + }, + "summary": "Scrape a URL" + } + }, + "description": "Send a request to get the status and results of a crawl job.", + "operationId": "getCrawlJobStatus", + "parameters": [ + { + "description": "ID of the crawl job to check.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": {} + }, + "responses": { + "200": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + }, + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "The number of pages crawled so far.", + "type": "integer" + }, + "data": { + "description": "The crawl results. Only available when the crawl job is completed.", + "items": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", + "type": "string" + }, + "index": { + "description": "The index of the crawled page in the results.", + "type": "integer" + }, + "markdown": { + "description": "Markdown content of the page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the page.", + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": "string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "partial_data": { + "description": "Partial results streamed as the crawl progresses. This feature is in alpha and may change.", + "items": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", + "type": "string" + }, + "index": { + "description": "The index of the crawled page in the results.", + "type": "integer" + }, + "markdown": { + "description": "Markdown content of the page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the page.", + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": "string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Status of the crawl job. Can be 'completed', 'active', 'failed', or 'paused'.", + "enum": [ + "completed", + "active", + "failed", + "paused" + ], + "type": "string" + }, + "total": { + "description": "The total estimated number of pages to crawl.", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status retrieved." + } + }, + "summary": "Get Crawl Job Status" + } + }, + "delete": { + "description": "Send a request to cancel a running crawl job.", + "operationId": "cancelCrawlJob", + "parameters": [ + { + "description": "ID of the crawl job to cancel.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": {} + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "The status of the crawl job cancellation request, usually 'cancelled'.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job cancellation request submitted." + }, + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + } + }, + "summary": "Cancel a Crawl Job" + } + }, + "description": "Send a request to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.", + "operationId": "crawlWebsite", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Options for controlling the crawling behavior.", + "properties": { + "allowBackwardCrawling": { + "default": false, + "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", + "type": "boolean" + }, + "allowExternalContentLinks": { + "default": false, + "description": "Allows the crawler to follow links to external websites.", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "default": false, + "description": "Generate alt text for images using LLMs (must have a paid plan)", + "type": "boolean" + }, + "ignoreSitemap": { + "default": false, + "description": "Ignore the website sitemap when crawling", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "default": 10000, + "description": "Maximum number of pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.", + "type": "integer" + }, + "mode": { + "default": "default", + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "default": false, + "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for controlling the scraping behavior of individual pages.", + "properties": { + "fullPageScreenshot": { + "default": false, + "description": "Include a full page screenshot of the page that you are scraping.", + "type": "boolean" + }, + "headers": { + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", + "type": "object" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + }, + "removeTags": { + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "default": false, + "description": "Replace all relative paths with absolute paths for images and links", + "type": "boolean" + }, + "screenshot": { + "default": false, + "description": "Include a screenshot of the top of the page that you are scraping.", + "type": "boolean" + }, + "waitFor": { + "default": 0, + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "The base URL to start crawling from", + "required": true, + "type": "string" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "The ID of the submitted crawl job.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted successfully." + }, + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + } + } + }, + "summary": "Crawl a Website" + } + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/dify_api_spec.json b/examples/turning_docs_into_api_specs/dify_api_spec.json new file mode 100644 index 00000000..e6eec457 --- /dev/null +++ b/examples/turning_docs_into_api_specs/dify_api_spec.json @@ -0,0 +1,164 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Knowledge Base API", + "description": "API for managing knowledge bases and documents." + }, + "paths": { + "/datasets": { + "post": { + "summary": "Create an Empty Dataset", + "description": "Only used to create an empty dataset", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + } + } + }, + "responses": {} + }, + "get": { + "summary": "Dataset List", + "parameters": [ + { + "name": "page", + "in": "query", + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "schema": { + "type": "integer" + } + } + ], + "responses": {} + } + }, + "/datasets/{dataset_id}/document/create_by_text": { + "post": { + "summary": "Create Document by Text", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "text": { + "type": "string" + }, + "indexing_technique": { + "type": "string" + }, + "process_rule": { + "type": "object" + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/document/create_by_file": { + "post": { + "summary": "Create Document by File", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "properties": { + "data": { + "type": "string" + }, + "file": { + "type": "string", + "format": "binary" + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{batch}/indexing-status": { + "get": { + "summary": "Get Document Embedding Status (Progress)", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{document_id}": { + "delete": { + "summary": "Delete Document", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents": { + "get": { + "summary": "Dataset Document List", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{document_id}/segments": { + "post": { + "summary": "Add Segments", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "segments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "answer": { + "type": "string" + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/segments/{segment_id}": { + "delete": { + "summary": "Delete Document Segment", + "responses": {} + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json new file mode 100644 index 00000000..84bce02c --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json @@ -0,0 +1,211 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawling options.", + "properties": { + "excludes": { + "description": "URL patterns to exclude.", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URL patterns to include.", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl.", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth.", + "type": "integer" + }, + "mode": { + "description": "Crawling mode.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job initiated." + } + }, + "summary": "Crawl multiple pages." + } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "Crawl job ID.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Data extraction options.", + "properties": { + "extractionPrompt": { + "description": "Prompt for data extraction.", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for data extraction.", + "type": "object" + }, + "mode": { + "description": "Extraction mode.", + "enum": [ + "llm-extraction", + "llm-extraction-from-raw-html" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in milliseconds.", + "type": "integer" + }, + "url": { + "description": "URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Successful scraping." + } + }, + "summary": "Scrape a single page." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json new file mode 100644 index 00000000..8656c978 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json @@ -0,0 +1,165 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "allowBackwardCrawling": { + "description": "Allow backward crawling", + "type": "boolean" + }, + "allowExternalContentLinks": { + "description": "Allow external links", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "ignoreSitemap": { + "description": "Ignore website sitemap", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only crawled URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for requests", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Return only main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Use absolute paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait for page load (ms)", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID of the crawl", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl request successful" + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Crawl a website" + } + } + }, + "securitySchemes": { + "Bearer": { + "bearerFormat": "JWT", + "scheme": "bearer", + "type": "http" + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json new file mode 100644 index 00000000..55f73a32 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json @@ -0,0 +1,93 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check_crawl_status": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl job status" + } + }, + "/crawl": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Job ID" + } + }, + "summary": "Crawl URL and subpages" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json new file mode 100644 index 00000000..e19ed056 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json @@ -0,0 +1,131 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "example": { + "extractorOptions": { + "extractionPrompt": "Based on the information on the page, extract the information from the schema. ", + "extractionSchema": { + "properties": { + "company_mission": { + "type": "string" + }, + "is_in_yc": { + "type": "boolean" + }, + "is_open_source": { + "type": "boolean" + }, + "supports_sso": { + "type": "boolean" + } + }, + "required": [ + "company_mission", + "supports_sso", + "is_open_source", + "is_in_yc" + ], + "type": "object" + }, + "mode": "llm-extraction" + }, + "url": "https://docs.firecrawl.dev/" + }, + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionPrompt": { + "description": "Prompt for extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for data extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "example": { + "data": { + "content": "Raw Content", + "llm_extraction": { + "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", + "is_in_yc": true, + "is_open_source": false, + "supports_sso": true + }, + "metadata": { + "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", + "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", + "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", + "ogLocaleAlternate": [], + "ogSiteName": "Mendable", + "ogTitle": "Mendable", + "ogUrl": "https://docs.firecrawl.dev/", + "robots": "follow, index", + "sourceURL": "https://docs.firecrawl.dev/", + "title": "Mendable" + } + }, + "success": true + }, + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "metadata": { + "type": "object" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "summary": "Extract data from pages." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json new file mode 100644 index 00000000..0352c66f --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json @@ -0,0 +1,87 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search and scrape." + } + }, + "summary": "Search web, scrape, return markdown." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json new file mode 100644 index 00000000..e7384f8e --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json @@ -0,0 +1,83 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "url": { + "description": "Website URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "properties": { + "markdown": { + "description": "Markdown content.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + } + } + }, + "description": "Website crawled successfully." + } + }, + "summary": "Crawl a website." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "url": { + "description": "Page URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "text/plain": { + "schema": { + "description": "Scraped content.", + "type": "string" + } + } + }, + "description": "Page scraped successfully." + } + }, + "summary": "Scrape a single page." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json new file mode 100644 index 00000000..ed6fb9d6 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json @@ -0,0 +1,200 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawler_options": { + "properties": { + "exclude": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Max pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs", + "type": "boolean" + }, + "timeout": { + "description": "Timeout in milliseconds", + "type": "integer" + } + }, + "type": "object" + }, + "page_options": { + "properties": { + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Crawl successful." + } + }, + "summary": "Crawl a website." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractor_options": { + "properties": { + "extractionPrompt": { + "description": "Prompt for extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for extraction", + "type": "string" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "page_options": { + "properties": { + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in milliseconds", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Scrape successful." + } + }, + "summary": "Scrape a website." + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "page_options": { + "properties": { + "fetchPageContent": { + "description": "Fetch full content", + "type": "boolean" + }, + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search query string", + "type": "string" + }, + "search_options": { + "properties": { + "limit": { + "description": "Max results", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Search successful." + } + }, + "summary": "Search Firecrawl index." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json new file mode 100644 index 00000000..25cf6c05 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json @@ -0,0 +1,54 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl/cancel/{jobId}": { + "delete": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Returns cancelled." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Cancel crawl job" + } + } + }, + "securitySchemes": { + "Bearer": { + "bearerFormat": "Bearer ", + "scheme": "bearer", + "type": "http" + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json new file mode 100644 index 00000000..ac146a63 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json @@ -0,0 +1,166 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check-crawl-status/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current progress", + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + "markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "type": "object" + }, + "provider": { + "description": "Data provider", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Job status", + "type": "string" + }, + "total": { + "description": "Total pages", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawler options", + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted." + } + }, + "summary": "Crawl a URL." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Extractor options", + "type": "object" + }, + "pageOptions": { + "description": "Page options", + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Scraped data", + "type": "object" + }, + "success": { + "description": "Success flag", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json new file mode 100644 index 00000000..9701a462 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json @@ -0,0 +1,229 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "excludes": { + "description": "Paths to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Paths to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "returnOnlyUrls": { + "description": "Only return URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job created" + } + }, + "summary": "Crawl a website" + } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + "properties": { + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Job status", + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Get crawl job status" + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scrape results" + } + }, + "summary": "Scrape a webpage" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json new file mode 100644 index 00000000..b642e9c0 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json @@ -0,0 +1,115 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "example": { + "extractorOptions": { + "extractionPrompt": "Extract company info.", + "extractionSchema": { + "properties": { + "company_description": { + "type": "string" + }, + "company_industry": { + "type": "string" + }, + "who_they_serve": { + "type": "string" + } + }, + "required": [ + "company_description", + "company_industry", + "who_they_serve" + ], + "type": "object" + }, + "mode": "llm-extraction" + }, + "pageOptions": { + "onlyMainContent": true + }, + "url": "https://example.com" + }, + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction.", + "type": "string" + }, + "extractionSchema": { + "properties": { + "properties": { + "company_description": { + "type": "string" + }, + "company_industry": { + "type": "string" + }, + "who_they_serve": { + "type": "string" + } + }, + "required": [ + "company_description", + "company_industry", + "who_they_serve" + ], + "type": { + "type": "string" + } + }, + "type": "object" + }, + "mode": { + "description": "Extraction mode.", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Successful scrape." + } + }, + "summary": "Scrape data from URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json new file mode 100644 index 00000000..bcf94159 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json @@ -0,0 +1,185 @@ +{ + "components": { + "securitySchemes": { + "bearerAuth": { + "scheme": "bearer", + "type": "http" + } + } + }, + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction", + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for LLM extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for request", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Only return main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Replace relative paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in ms", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in ms", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "security": [ + { + "bearerAuth": [] + } + ], + "summary": "Scrape a webpage" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json new file mode 100644 index 00000000..bc542e2a --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json @@ -0,0 +1,212 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawl job options", + "properties": { + "excludes": { + "description": "Pages to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Pages to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Max pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options", + "properties": { + "onlyMainContent": { + "description": "Only scrape main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "required": [ + "url" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Crawl job result", + "type": "object" + } + } + }, + "description": "Crawl job result" + } + }, + "summary": "Crawl a website" + } + }, + "/crawl/{jobId}/cancel": { + "post": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Cancellation status", + "type": "object" + } + } + }, + "description": "Cancellation status" + } + }, + "summary": "Cancel crawl job" + } + }, + "/crawl/{jobId}/status": { + "get": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Crawl status", + "type": "object" + } + } + }, + "description": "Crawl status" + } + }, + "summary": "Check crawl status" + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "LLM extraction options", + "properties": { + "extractionSchema": { + "description": "JSON schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "required": [ + "url" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Scraped data", + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL" + } + }, + "/search": { + "get": { + "parameters": [ + { + "description": "Search query", + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Search results", + "type": "object" + } + } + }, + "description": "Search results" + } + }, + "summary": "Search and scrape" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json new file mode 100644 index 00000000..07f71759 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json @@ -0,0 +1,199 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "excludes": { + "description": "Paths to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Paths to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "Starting URL for crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Unique job identifier", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job started" + } + }, + "summary": "Crawl a website" + } + }, + "/crawl/{jobId}/status": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "Current job status", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl status" + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionSchema": { + "description": "Zod schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Extracted data", + "type": "object" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL" + } + }, + "/search": { + "get": { + "parameters": [ + { + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "properties": { + "content": { + "description": "Page content (optional)", + "type": "string" + }, + "url": { + "description": "Result URL", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + } + } + }, + "description": "Search results" + } + }, + "summary": "Search for a query" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json new file mode 100644 index 00000000..b45ae841 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json @@ -0,0 +1,202 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Options for crawling", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URLs to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for page content", + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Unique crawl job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job started." + } + }, + "summary": "Crawl a website." + } + }, + "/crawl/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "Current job status", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for data extraction", + "properties": { + "extractionSchema": { + "description": "Pydantic schema", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for page content", + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "query": { + "description": "Search query", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Search results." + } + }, + "summary": "Search the web." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json new file mode 100644 index 00000000..3bafda42 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json @@ -0,0 +1,201 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "0.1" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawl job options", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URLs to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options", + "properties": { + "onlyMainContent": { + "description": "Only scrape main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Crawl job started" + } + }, + "summary": "Crawl a website." + } + }, + "/crawl/{job_id}/cancel": { + "post": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "job_id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Cancellation status" + } + }, + "summary": "Cancel crawl job." + } + }, + "/crawl/{job_id}/status": { + "get": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "job_id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Crawl status" + } + }, + "summary": "Check crawl status." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "LLM extraction options", + "properties": { + "extractionSchema": { + "description": "JSON schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL." + } + }, + "/search": { + "get": { + "parameters": [ + { + "description": "Search query", + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Search results" + } + }, + "summary": "Search and scrape results." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json new file mode 100644 index 00000000..890d31b1 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json @@ -0,0 +1,245 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check-crawl-status": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current page count", + "type": "integer" + }, + "data": { + "description": "Crawl data", + "items": { + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + "markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "properties": { + "description": { + "description": "Page description", + "type": "string" + }, + "language": { + "description": "Page language", + "type": "string" + }, + "sourceURL": { + "description": "Page URL", + "type": "string" + }, + "title": { + "description": "Page title", + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "description": "Content provider", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Crawl status", + "type": "string" + }, + "total": { + "description": "Total page count", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawler options", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted." + } + }, + "summary": "Crawl a URL." + } + }, + "/scrape-url": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Extractor options", + "properties": { + "extractionSchema": { + "description": "Extraction schema", + "type": "string" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page options", + "properties": { + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Scraped data", + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + "html": { + "description": "HTML content", + "type": "string" + }, + "llm_extraction": { + "description": "LLM extraction results", + "type": "object" + }, + "markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content", + "type": "string" + }, + "warning": { + "description": "Warning message", + "type": "string" + } + }, + "type": "object" + }, + "success": { + "description": "Request success", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json new file mode 100644 index 00000000..daf53932 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json @@ -0,0 +1,129 @@ +{ + "components": { + "securitySchemes": { + "Bearer": { + "scheme": "bearer", + "type": "http" + } + } + }, + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch content of each page.", + "type": "boolean" + }, + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only return main content.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The query to search for", + "type": "string" + }, + "searchOptions": { + "properties": { + "limit": { + "description": "Maximum number of results.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Search the web." + } + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json new file mode 100644 index 00000000..4fae28c0 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json @@ -0,0 +1,186 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current page number", + "type": "integer" + }, + "data": { + "description": "Data from the job", + "items": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "description": "HTML content", + "nullable": true, + "type": "string" + }, + "index": { + "description": "Page number crawled", + "type": "integer" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message of page", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "Status code of page", + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + }, + "{any other metadata}": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "partial_data": { + "description": "Partial documents (streaming)", + "items": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "description": "HTML content", + "nullable": true, + "type": "string" + }, + "index": { + "description": "Page number crawled", + "type": "integer" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message of page", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "Status code of page", + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + }, + "{any other metadata}": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Status of the job", + "type": "string" + }, + "total": { + "description": "Total number of pages", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Successful operation" + } + }, + "security": [ + { + "Authorization": [] + } + ], + "summary": "Get crawl job status" + } + } + }, + "securitySchemes": { + "Authorization": { + "bearerFormat": "Bearer ", + "scheme": "bearer", + "type": "http" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json new file mode 100644 index 00000000..b74b9886 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json @@ -0,0 +1,86 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch page content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search term", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search" + } + }, + "summary": "Search and extract content" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json new file mode 100644 index 00000000..2d5f40e2 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json @@ -0,0 +1,59 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/test": { + "get": { + "description": "Returns a test message.", + "responses": { + "200": { + "content": { + "text/plain": { + "schema": { + "example": "Hello, world!", + "type": "string" + } + } + }, + "description": "Successful operation" + } + }, + "summary": "Test endpoint" + } + }, + "/v0/crawl": { + "post": { + "description": "Processes crawl job for URL.", + "requestBody": { + "content": { + "application/json": { + "example": { + "url": "https://docs.firecrawl.dev" + }, + "schema": { + "properties": { + "url": { + "description": "Website URL", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "URL to crawl", + "required": true + }, + "responses": { + "200": { + "description": "Crawl initiated." + } + }, + "summary": "Crawl a given URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json new file mode 100644 index 00000000..77d67234 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json @@ -0,0 +1,738 @@ +{ + "components": { + "schemas": {} + }, + "info": { + "title": "https://docs.firecrawl.dev API Specification", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check_crawl_status": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl job status" + } + }, + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "allowBackwardCrawling": { + "description": "Allow backward crawling", + "type": "boolean" + }, + "allowExternalContentLinks": { + "description": "Allow external links", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "ignoreSitemap": { + "description": "Ignore website sitemap", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only crawled URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for requests", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Return only main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Use absolute paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait for page load (ms)", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID of the crawl", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl request successful" + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Crawl a website" + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Returns cancelled." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Cancel crawl job" + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch content of each page.", + "type": "boolean" + }, + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only return main content.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The query to search for", + "type": "string" + }, + "searchOptions": { + "properties": { + "limit": { + "description": "Maximum number of results.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Search the web." + } + }, + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawling options.", + "properties": { + "excludes": { + "description": "URL patterns to exclude.", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URL patterns to include.", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl.", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth.", + "type": "integer" + }, + "mode": { + "description": "Crawling mode.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job initiated." + } + }, + "summary": "Crawl multiple pages." + } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "Crawl job ID.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction", + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for LLM extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for request", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Only return main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Replace relative paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in ms", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in ms", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "security": [ + { + "bearerAuth": [] + } + ], + "summary": "Scrape a webpage" + } + }, + "/v0/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch page content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search term", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search" + } + }, + "summary": "Search and extract content" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb new file mode 100644 index 00000000..1b97f67b --- /dev/null +++ b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "import google.generativeai as genai\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Configure the Google Generative AI module with the API key\n", + "genai.configure(api_key=google_api_key)\n", + "model = genai.GenerativeModel(\"gemini-1.5-pro-001\")\n", + "\n", + "# Set the docs URL\n", + "docs_url=\"https://docs.firecrawl.dev\"\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "36\n" + ] + } + ], + "source": [ + "# Crawl all pages on docs\n", + "params = {\n", + " \"pageOptions\": {\n", + " \"onlyMainContent\": True\n", + " },\n", + "}\n", + "crawl_result = app.crawl_url(docs_url, params=params)\n", + "\n", + "print(len(crawl_result))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_instructions = f\"\"\"Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.\n", + "\n", + "If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}.\n", + "\n", + "Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.\n", + "\n", + "API Documentation Content:\n", + "{{content}}\n", + "\n", + "Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.\n", + "\n", + "Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}.\n", + "\n", + "To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}.\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API specification saved to docs.firecrawl.dev/api_spec_0.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_1.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_2.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_3.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_4.json\n", + "An error occurred for page 5: 'content'\n", + "No API specification found for page 6\n", + "API specification saved to docs.firecrawl.dev/api_spec_7.json\n", + "No API specification found for page 8\n", + "No API specification found for page 9\n", + "API specification saved to docs.firecrawl.dev/api_spec_10.json\n", + "No API specification found for page 11\n", + "No API specification found for page 12\n", + "API specification saved to docs.firecrawl.dev/api_spec_13.json\n", + "No API specification found for page 14\n", + "No API specification found for page 15\n", + "No API specification found for page 16\n", + "No API specification found for page 17\n", + "No API specification found for page 18\n", + "No API specification found for page 19\n", + "No API specification found for page 20\n", + "No API specification found for page 21\n", + "No API specification found for page 22\n", + "No API specification found for page 23\n", + "No API specification found for page 24\n", + "No API specification found for page 25\n", + "No API specification found for page 26\n", + "No API specification found for page 27\n", + "No API specification found for page 28\n", + "No API specification found for page 29\n", + "No API specification found for page 30\n", + "No API specification found for page 31\n", + "No API specification found for page 32\n", + "No API specification found for page 33\n", + "No API specification found for page 34\n", + "No API specification found for page 35\n", + "Total API specifications collected: 8\n" + ] + } + ], + "source": [ + "# Create a folder for storing API specs\n", + "import os\n", + "import urllib.parse\n", + "\n", + "folder_name = urllib.parse.urlparse(docs_url).netloc\n", + "os.makedirs(folder_name, exist_ok=True)\n", + "\n", + "# Initialize a list to store all API specs\n", + "all_api_specs = []\n", + "\n", + "# Process each page in crawl_result\n", + "for index, result in enumerate(crawl_result):\n", + " if 'content' in result:\n", + " # Update prompt_instructions with the current page's content\n", + " current_prompt = prompt_instructions.replace(\"{content}\", result['content'])\n", + " try:\n", + " # Query the model\n", + " response = model.generate_content([current_prompt])\n", + " response_dict = response.to_dict()\n", + " response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", + " \n", + " # Remove the ```json code wrap if present\n", + " response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()\n", + " \n", + " # Parse JSON\n", + " json_data = json.loads(response_text)\n", + " \n", + " # Save non-empty API specs\n", + " if json_data != {}:\n", + " output_file = os.path.join(folder_name, f'api_spec_{index}.json')\n", + " with open(output_file, 'w') as f:\n", + " json.dump(json_data, f, indent=2, sort_keys=True)\n", + " print(f\"API specification saved to {output_file}\")\n", + " \n", + " # Add the API spec to the list\n", + " all_api_specs.append(json_data)\n", + " else:\n", + " print(f\"No API specification found for page {index}\")\n", + " \n", + " except json.JSONDecodeError:\n", + " print(f\"Error parsing JSON response for page {index}\")\n", + " except Exception as e:\n", + " print(f\"An error occurred for page {index}: {str(e)}\")\n", + "\n", + "# Print the total number of API specs collected\n", + "print(f\"Total API specifications collected: {len(all_api_specs)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Combined API specification saved to docs.firecrawl.dev/combined_api_spec.json\n", + "Total paths in combined spec: 8\n", + "Total schemas in combined spec: 0\n" + ] + } + ], + "source": [ + "# Combine all API specs and keep the most filled out spec for each path and method\n", + "combined_spec = {\n", + " \"openapi\": \"3.0.0\",\n", + " \"info\": {\n", + " \"title\": f\"{docs_url} API Specification\",\n", + " \"version\": \"1.0.0\"\n", + " },\n", + " \"paths\": {},\n", + " \"components\": {\n", + " \"schemas\": {}\n", + " }\n", + "}\n", + "\n", + "def count_properties(obj):\n", + " if isinstance(obj, dict):\n", + " return sum(count_properties(v) for v in obj.values()) + len(obj)\n", + " elif isinstance(obj, list):\n", + " return sum(count_properties(item) for item in obj)\n", + " else:\n", + " return 1\n", + "\n", + "for spec in all_api_specs:\n", + " if \"paths\" in spec:\n", + " for path, methods in spec[\"paths\"].items():\n", + " if path not in combined_spec[\"paths\"]:\n", + " combined_spec[\"paths\"][path] = {}\n", + " for method, details in methods.items():\n", + " if method not in combined_spec[\"paths\"][path] or count_properties(details) > count_properties(combined_spec[\"paths\"][path][method]):\n", + " combined_spec[\"paths\"][path][method] = details\n", + "\n", + " if \"components\" in spec and \"schemas\" in spec[\"components\"]:\n", + " for schema_name, schema in spec[\"components\"][\"schemas\"].items():\n", + " if schema_name not in combined_spec[\"components\"][\"schemas\"] or count_properties(schema) > count_properties(combined_spec[\"components\"][\"schemas\"][schema_name]):\n", + " combined_spec[\"components\"][\"schemas\"][schema_name] = schema\n", + "\n", + "# Save the combined API spec\n", + "output_file = os.path.join(folder_name, 'combined_api_spec.json')\n", + "with open(output_file, 'w') as f:\n", + " json.dump(combined_spec, f, indent=2, sort_keys=True)\n", + "\n", + "print(f\"Combined API specification saved to {output_file}\")\n", + "print(f\"Total paths in combined spec: {len(combined_spec['paths'])}\")\n", + "print(f\"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# note: turn this into a simple web app like roast my site\n", + "- select which methods you want to add\n", + "- generate a UI for each method\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 995a3ff5bb9b7091ba5cb40e7656d10062dad87e Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 10:49:59 +0200 Subject: [PATCH 02/47] chore(tsconfig): modernize and remove commonjs --- apps/js-sdk/firecrawl/tsconfig.json | 124 ++++++---------------------- 1 file changed, 23 insertions(+), 101 deletions(-) diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index 56f13ced..071b13ce 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -1,110 +1,32 @@ { "compilerOptions": { - /* Visit https://aka.ms/tsconfig to read more about this file */ + // See https://www.totaltypescript.com/tsconfig-cheat-sheet + /* Base Options: */ + "esModuleInterop": true, + "skipLibCheck": true, + "target": "es2022", + "allowJs": true, + "resolveJsonModule": true, + "moduleDetection": "force", + "isolatedModules": true, + "verbatimModuleSyntax": true, - /* Projects */ - // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ - // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ - // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ - // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ - // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ - // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + /* Strictness */ + "strict": true, + "noUncheckedIndexedAccess": true, + "noImplicitOverride": true, - /* Language and Environment */ - "target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ - // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ - // "jsx": "preserve", /* Specify what JSX code is generated. */ - // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ - // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ - // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ - // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ - // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ - // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ - // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ - // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ - // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + /* If transpiling with TypeScript: */ + "module": "NodeNext", + "outDir": "dist", + "rootDir": "src", + "sourceMap": true, - /* Modules */ - "module": "commonjs", /* Specify what module code is generated. */ - "rootDir": "./src", /* Specify the root folder within your source files. */ - "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ - // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ - // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ - // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ - // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ - // "types": [], /* Specify type package names to be included without being referenced in a source file. */ - // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ - // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ - // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ - // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ - // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ - // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ - // "resolveJsonModule": true, /* Enable importing .json files. */ - // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ - // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ + /* AND if you're building for a library: */ + "declaration": true, - /* JavaScript Support */ - // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ - // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ - // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ - - /* Emit */ - "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ - // "declarationMap": true, /* Create sourcemaps for d.ts files. */ - // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ - // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ - // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ - // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ - "outDir": "./build", /* Specify an output folder for all emitted files. */ - // "removeComments": true, /* Disable emitting comments. */ - // "noEmit": true, /* Disable emitting files from a compilation. */ - // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ - // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ - // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ - // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ - // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ - // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ - // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ - // "newLine": "crlf", /* Set the newline character for emitting files. */ - // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ - // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ - // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ - // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ - "declarationDir": "./types", /* Specify the output directory for generated declaration files. */ - // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ - - /* Interop Constraints */ - // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ - // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ - // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ - "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ - // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ - "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ - - /* Type Checking */ - "strict": true, /* Enable all strict type-checking options. */ - // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ - // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ - // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ - // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ - // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ - // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ - // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ - // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ - // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ - // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ - // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ - // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ - // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ - // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ - // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ - // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ - // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ - // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ - - /* Completeness */ - // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ - "skipLibCheck": true /* Skip type checking all .d.ts files. */ + /* AND if you're building for a library in a monorepo: */ + "declarationMap": true /* Skip type checking all .d.ts files. */ }, "include": ["src/**/*"], "exclude": ["node_modules", "dist", "**/__tests__/*"] From 41241f4d652f52669ca27e24969969e68f536468 Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 10:50:19 +0200 Subject: [PATCH 03/47] chore(.gitignore): add `apps/js-sdk/firecrawl/dist` --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 9eb551a9..45f0d802 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ apps/playwright-service-ts/package-lock.json *.pyc .rdb + +apps/js-sdk/firecrawl/dist \ No newline at end of file From fe8f9d4b2ff2cb822be1a34eabd2125d4fc5db5c Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 10:50:52 +0200 Subject: [PATCH 04/47] feat(js-sdk): drop `commonjs` outputs and simplify build process --- apps/js-sdk/firecrawl/build/cjs/index.js | 347 ------------------- apps/js-sdk/firecrawl/build/cjs/package.json | 1 - apps/js-sdk/firecrawl/build/esm/index.js | 339 ------------------ apps/js-sdk/firecrawl/build/esm/package.json | 1 - apps/js-sdk/firecrawl/package.json | 16 +- apps/js-sdk/firecrawl/src/index.ts | 2 +- apps/js-sdk/firecrawl/types/index.d.ts | 260 -------------- 7 files changed, 4 insertions(+), 962 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js delete mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json delete mode 100644 apps/js-sdk/firecrawl/build/esm/index.js delete mode 100644 apps/js-sdk/firecrawl/build/esm/package.json delete mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js deleted file mode 100644 index 2908b09d..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/index.js +++ /dev/null @@ -1,347 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.CrawlWatcher = void 0; -const axios_1 = __importDefault(require("axios")); -const zod_to_json_schema_1 = require("zod-to-json-schema"); -const isows_1 = require("isows"); -const typescript_event_target_1 = require("typescript-event-target"); -/** - * Main class for interacting with the Firecrawl API. - * Provides methods for scraping, searching, crawling, and mapping web content. - */ -class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - } - /** - * Scrapes a URL using the Firecrawl API. - * @param url - The URL to scrape. - * @param params - Additional parameters for the scrape request. - * @returns The response from the scrape operation. - */ - async scrapeUrl(url, params) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { url, ...params }; - if (jsonData?.extract?.schema) { - let schema = jsonData.extract.schema; - // Try parsing the schema as a Zod schema - try { - schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); - } - catch (error) { - } - jsonData = { - ...jsonData, - extract: { - ...jsonData.extract, - schema: schema, - }, - }; - } - try { - const response = await axios_1.default.post(this.apiUrl + `/v1/scrape`, jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return { - success: true, - warning: responseData.warning, - error: responseData.error, - ...responseData.data - }; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - /** - * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API. - * @param query - The search query string. - * @param params - Additional parameters for the search. - * @returns Throws an error advising to use version 0 of the API. - */ - async search(query, params) { - throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0."); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param url - The URL to crawl. - * @param params - Additional parameters for the crawl request. - * @param pollInterval - Time in seconds for job status checks. - * @param idempotencyKey - Optional idempotency key for the request. - * @returns The response from the crawl operation. - */ - async crawlUrl(url, params, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers); - if (response.status === 200) { - const id = response.data.id; - return this.monitorJobStatus(id, headers, pollInterval); - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - if (error.response?.data?.error) { - throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); - } - else { - throw new Error(error.message); - } - } - return { success: false, error: "Internal server error." }; - } - async asyncCrawlUrl(url, params, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers); - if (response.status === 200) { - return response.data; - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - if (error.response?.data?.error) { - throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); - } - else { - throw new Error(error.message); - } - } - return { success: false, error: "Internal server error." }; - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param id - The ID of the crawl operation. - * @returns The response containing the job status. - */ - async checkCrawlStatus(id) { - if (!id) { - throw new Error("No crawl ID provided"); - } - const headers = this.prepareHeaders(); - try { - const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers); - if (response.status === 200) { - return ({ - success: true, - status: response.data.status, - total: response.data.total, - completed: response.data.completed, - creditsUsed: response.data.creditsUsed, - expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: response.data.data, - error: response.data.error - }); - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - async crawlUrlAndWatch(url, params, idempotencyKey) { - const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey); - if (crawl.success && crawl.id) { - const id = crawl.id; - return new CrawlWatcher(id, this); - } - throw new Error("Crawl job failed to start"); - } - async mapUrl(url, params) { - const headers = this.prepareHeaders(); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers); - if (response.status === 200) { - return response.data; - } - else { - this.handleError(response, "map"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - /** - * Prepares the headers for an API request. - * @param idempotencyKey - Optional key to ensure idempotency. - * @returns The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}), - }; - } - /** - * Sends a POST request to the specified URL. - * @param url - The URL to send the request to. - * @param data - The data to send in the request. - * @param headers - The headers for the request. - * @returns The response from the POST request. - */ - postRequest(url, data, headers) { - return axios_1.default.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param url - The URL to send the request to. - * @param headers - The headers for the request. - * @returns The response from the GET request. - */ - getRequest(url, headers) { - return axios_1.default.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param id - The ID of the crawl operation. - * @param headers - The headers for the request. - * @param checkInterval - Interval in seconds for job status checks. - * @param checkUrl - Optional URL to check the status (used for v1 API) - * @returns The final job status or data. - */ - async monitorJobStatus(id, headers, checkInterval) { - while (true) { - const statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)) { - checkInterval = Math.max(checkInterval, 2); - await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} -exports.default = FirecrawlApp; -class CrawlWatcher extends typescript_event_target_1.TypedEventTarget { - constructor(id, app) { - super(); - this.ws = new isows_1.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); - this.status = "scraping"; - this.data = []; - const messageHandler = (msg) => { - if (msg.type === "done") { - this.status = "completed"; - this.dispatchTypedEvent("done", new CustomEvent("done", { - detail: { - status: this.status, - data: this.data, - }, - })); - } - else if (msg.type === "error") { - this.status = "failed"; - this.dispatchTypedEvent("error", new CustomEvent("error", { - detail: { - status: this.status, - data: this.data, - error: msg.error, - }, - })); - } - else if (msg.type === "catchup") { - this.status = msg.data.status; - this.data.push(...(msg.data.data ?? [])); - for (const doc of this.data) { - this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, - })); - } - } - else if (msg.type === "document") { - this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, - })); - } - }; - this.ws.onmessage = ((ev) => { - if (typeof ev.data !== "string") { - this.ws.close(); - return; - } - const msg = JSON.parse(ev.data); - messageHandler(msg); - }).bind(this); - this.ws.onclose = ((ev) => { - const msg = JSON.parse(ev.reason); - messageHandler(msg); - }).bind(this); - this.ws.onerror = ((_) => { - this.status = "failed"; - this.dispatchTypedEvent("error", new CustomEvent("error", { - detail: { - status: this.status, - data: this.data, - error: "WebSocket error", - }, - })); - }).bind(this); - } - close() { - this.ws.close(); - } -} -exports.CrawlWatcher = CrawlWatcher; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json deleted file mode 100644 index b731bd61..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js deleted file mode 100644 index 4245cc37..00000000 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ /dev/null @@ -1,339 +0,0 @@ -import axios from "axios"; -import { zodToJsonSchema } from "zod-to-json-schema"; -import { WebSocket } from "isows"; -import { TypedEventTarget } from "typescript-event-target"; -/** - * Main class for interacting with the Firecrawl API. - * Provides methods for scraping, searching, crawling, and mapping web content. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - } - /** - * Scrapes a URL using the Firecrawl API. - * @param url - The URL to scrape. - * @param params - Additional parameters for the scrape request. - * @returns The response from the scrape operation. - */ - async scrapeUrl(url, params) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { url, ...params }; - if (jsonData?.extract?.schema) { - let schema = jsonData.extract.schema; - // Try parsing the schema as a Zod schema - try { - schema = zodToJsonSchema(schema); - } - catch (error) { - } - jsonData = { - ...jsonData, - extract: { - ...jsonData.extract, - schema: schema, - }, - }; - } - try { - const response = await axios.post(this.apiUrl + `/v1/scrape`, jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return { - success: true, - warning: responseData.warning, - error: responseData.error, - ...responseData.data - }; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - /** - * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API. - * @param query - The search query string. - * @param params - Additional parameters for the search. - * @returns Throws an error advising to use version 0 of the API. - */ - async search(query, params) { - throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0."); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param url - The URL to crawl. - * @param params - Additional parameters for the crawl request. - * @param pollInterval - Time in seconds for job status checks. - * @param idempotencyKey - Optional idempotency key for the request. - * @returns The response from the crawl operation. - */ - async crawlUrl(url, params, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers); - if (response.status === 200) { - const id = response.data.id; - return this.monitorJobStatus(id, headers, pollInterval); - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - if (error.response?.data?.error) { - throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); - } - else { - throw new Error(error.message); - } - } - return { success: false, error: "Internal server error." }; - } - async asyncCrawlUrl(url, params, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers); - if (response.status === 200) { - return response.data; - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - if (error.response?.data?.error) { - throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); - } - else { - throw new Error(error.message); - } - } - return { success: false, error: "Internal server error." }; - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param id - The ID of the crawl operation. - * @returns The response containing the job status. - */ - async checkCrawlStatus(id) { - if (!id) { - throw new Error("No crawl ID provided"); - } - const headers = this.prepareHeaders(); - try { - const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers); - if (response.status === 200) { - return ({ - success: true, - status: response.data.status, - total: response.data.total, - completed: response.data.completed, - creditsUsed: response.data.creditsUsed, - expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: response.data.data, - error: response.data.error - }); - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - async crawlUrlAndWatch(url, params, idempotencyKey) { - const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey); - if (crawl.success && crawl.id) { - const id = crawl.id; - return new CrawlWatcher(id, this); - } - throw new Error("Crawl job failed to start"); - } - async mapUrl(url, params) { - const headers = this.prepareHeaders(); - let jsonData = { url, ...params }; - try { - const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers); - if (response.status === 200) { - return response.data; - } - else { - this.handleError(response, "map"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - /** - * Prepares the headers for an API request. - * @param idempotencyKey - Optional key to ensure idempotency. - * @returns The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}), - }; - } - /** - * Sends a POST request to the specified URL. - * @param url - The URL to send the request to. - * @param data - The data to send in the request. - * @param headers - The headers for the request. - * @returns The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param url - The URL to send the request to. - * @param headers - The headers for the request. - * @returns The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param id - The ID of the crawl operation. - * @param headers - The headers for the request. - * @param checkInterval - Interval in seconds for job status checks. - * @param checkUrl - Optional URL to check the status (used for v1 API) - * @returns The final job status or data. - */ - async monitorJobStatus(id, headers, checkInterval) { - while (true) { - const statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)) { - checkInterval = Math.max(checkInterval, 2); - await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} -export class CrawlWatcher extends TypedEventTarget { - constructor(id, app) { - super(); - this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); - this.status = "scraping"; - this.data = []; - const messageHandler = (msg) => { - if (msg.type === "done") { - this.status = "completed"; - this.dispatchTypedEvent("done", new CustomEvent("done", { - detail: { - status: this.status, - data: this.data, - }, - })); - } - else if (msg.type === "error") { - this.status = "failed"; - this.dispatchTypedEvent("error", new CustomEvent("error", { - detail: { - status: this.status, - data: this.data, - error: msg.error, - }, - })); - } - else if (msg.type === "catchup") { - this.status = msg.data.status; - this.data.push(...(msg.data.data ?? [])); - for (const doc of this.data) { - this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, - })); - } - } - else if (msg.type === "document") { - this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, - })); - } - }; - this.ws.onmessage = ((ev) => { - if (typeof ev.data !== "string") { - this.ws.close(); - return; - } - const msg = JSON.parse(ev.data); - messageHandler(msg); - }).bind(this); - this.ws.onclose = ((ev) => { - const msg = JSON.parse(ev.reason); - messageHandler(msg); - }).bind(this); - this.ws.onerror = ((_) => { - this.status = "failed"; - this.dispatchTypedEvent("error", new CustomEvent("error", { - detail: { - status: this.status, - data: this.data, - error: "WebSocket error", - }, - })); - }).bind(this); - } - close() { - this.ws.close(); - } -} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json deleted file mode 100644 index 6990891f..00000000 --- a/apps/js-sdk/firecrawl/build/esm/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "module"} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e68b3014..430cffff 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,21 +2,11 @@ "name": "@mendable/firecrawl-js", "version": "1.2.1", "description": "JavaScript SDK for Firecrawl API", - "main": "build/cjs/index.js", - "types": "types/index.d.ts", + "main": "dist/index.js", + "types": "dist/index.d.ts", "type": "module", - "exports": { - "require": { - "types": "./types/index.d.ts", - "default": "./build/cjs/index.js" - }, - "import": { - "types": "./types/index.d.ts", - "default": "./build/esm/index.js" - } - }, "scripts": { - "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", + "build": "tsc", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts" diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1bb4ee..e9411527 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,4 +1,4 @@ -import axios, { AxiosResponse, AxiosRequestHeaders } from "axios"; +import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios"; import { z } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts deleted file mode 100644 index 36356c4e..00000000 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ /dev/null @@ -1,260 +0,0 @@ -import { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; -import { TypedEventTarget } from "typescript-event-target"; -/** - * Configuration interface for FirecrawlApp. - * @param apiKey - Optional API key for authentication. - * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'. - */ -export interface FirecrawlAppConfig { - apiKey?: string | null; - apiUrl?: string | null; -} -/** - * Metadata for a Firecrawl document. - * Includes various optional properties for document metadata. - */ -export interface FirecrawlDocumentMetadata { - title?: string; - description?: string; - language?: string; - keywords?: string; - robots?: string; - ogTitle?: string; - ogDescription?: string; - ogUrl?: string; - ogImage?: string; - ogAudio?: string; - ogDeterminer?: string; - ogLocale?: string; - ogLocaleAlternate?: string[]; - ogSiteName?: string; - ogVideo?: string; - dctermsCreated?: string; - dcDateCreated?: string; - dcDate?: string; - dctermsType?: string; - dcType?: string; - dctermsAudience?: string; - dctermsSubject?: string; - dcSubject?: string; - dcDescription?: string; - dctermsKeywords?: string; - modifiedTime?: string; - publishedTime?: string; - articleTag?: string; - articleSection?: string; - sourceURL?: string; - statusCode?: number; - error?: string; - [key: string]: any; -} -/** - * Document interface for Firecrawl. - * Represents a document retrieved or processed by Firecrawl. - */ -export interface FirecrawlDocument { - url?: string; - markdown?: string; - html?: string; - rawHtml?: string; - links?: string[]; - extract?: Record; - screenshot?: string; - metadata?: FirecrawlDocumentMetadata; -} -/** - * Parameters for scraping operations. - * Defines the options and configurations available for scraping web content. - */ -export interface ScrapeParams { - formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[]; - headers?: Record; - includeTags?: string[]; - excludeTags?: string[]; - onlyMainContent?: boolean; - extract?: { - prompt?: string; - schema?: z.ZodSchema | any; - systemPrompt?: string; - }; - waitFor?: number; - timeout?: number; -} -/** - * Response interface for scraping operations. - * Defines the structure of the response received after a scraping operation. - */ -export interface ScrapeResponse extends FirecrawlDocument { - success: true; - warning?: string; - error?: string; -} -/** - * Parameters for crawling operations. - * Includes options for both scraping and mapping during a crawl. - */ -export interface CrawlParams { - includePaths?: string[]; - excludePaths?: string[]; - maxDepth?: number; - limit?: number; - allowBackwardLinks?: boolean; - allowExternalLinks?: boolean; - ignoreSitemap?: boolean; - scrapeOptions?: ScrapeParams; - webhook?: string; -} -/** - * Response interface for crawling operations. - * Defines the structure of the response received after initiating a crawl. - */ -export interface CrawlResponse { - id?: string; - url?: string; - success: true; - error?: string; -} -/** - * Response interface for job status checks. - * Provides detailed status of a crawl job including progress and results. - */ -export interface CrawlStatusResponse { - success: true; - total: number; - completed: number; - creditsUsed: number; - expiresAt: Date; - status: "scraping" | "completed" | "failed"; - next: string; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Parameters for mapping operations. - * Defines options for mapping URLs during a crawl. - */ -export interface MapParams { - search?: string; - ignoreSitemap?: boolean; - includeSubdomains?: boolean; - limit?: number; -} -/** - * Response interface for mapping operations. - * Defines the structure of the response received after a mapping operation. - */ -export interface MapResponse { - success: true; - links?: string[]; - error?: string; -} -/** - * Error response interface. - * Defines the structure of the response received when an error occurs. - */ -export interface ErrorResponse { - success: false; - error: string; -} -/** - * Main class for interacting with the Firecrawl API. - * Provides methods for scraping, searching, crawling, and mapping web content. - */ -export default class FirecrawlApp { - apiKey: string; - apiUrl: string; - /** - * Initializes a new instance of the FirecrawlApp class. - * @param config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey, apiUrl }: FirecrawlAppConfig); - /** - * Scrapes a URL using the Firecrawl API. - * @param url - The URL to scrape. - * @param params - Additional parameters for the scrape request. - * @returns The response from the scrape operation. - */ - scrapeUrl(url: string, params?: ScrapeParams): Promise; - /** - * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API. - * @param query - The search query string. - * @param params - Additional parameters for the search. - * @returns Throws an error advising to use version 0 of the API. - */ - search(query: string, params?: any): Promise; - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param url - The URL to crawl. - * @param params - Additional parameters for the crawl request. - * @param pollInterval - Time in seconds for job status checks. - * @param idempotencyKey - Optional idempotency key for the request. - * @returns The response from the crawl operation. - */ - crawlUrl(url: string, params?: CrawlParams, pollInterval?: number, idempotencyKey?: string): Promise; - asyncCrawlUrl(url: string, params?: CrawlParams, idempotencyKey?: string): Promise; - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param id - The ID of the crawl operation. - * @returns The response containing the job status. - */ - checkCrawlStatus(id?: string): Promise; - crawlUrlAndWatch(url: string, params?: CrawlParams, idempotencyKey?: string): Promise; - mapUrl(url: string, params?: MapParams): Promise; - /** - * Prepares the headers for an API request. - * @param idempotencyKey - Optional key to ensure idempotency. - * @returns The prepared headers. - */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; - /** - * Sends a POST request to the specified URL. - * @param url - The URL to send the request to. - * @param data - The data to send in the request. - * @param headers - The headers for the request. - * @returns The response from the POST request. - */ - postRequest(url: string, data: any, headers: AxiosRequestHeaders): Promise; - /** - * Sends a GET request to the specified URL. - * @param url - The URL to send the request to. - * @param headers - The headers for the request. - * @returns The response from the GET request. - */ - getRequest(url: string, headers: AxiosRequestHeaders): Promise; - /** - * Monitors the status of a crawl job until completion or failure. - * @param id - The ID of the crawl operation. - * @param headers - The headers for the request. - * @param checkInterval - Interval in seconds for job status checks. - * @param checkUrl - Optional URL to check the status (used for v1 API) - * @returns The final job status or data. - */ - monitorJobStatus(id: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response: AxiosResponse, action: string): void; -} -interface CrawlWatcherEvents { - document: CustomEvent; - done: CustomEvent<{ - status: CrawlStatusResponse["status"]; - data: FirecrawlDocument[]; - }>; - error: CustomEvent<{ - status: CrawlStatusResponse["status"]; - data: FirecrawlDocument[]; - error: string; - }>; -} -export declare class CrawlWatcher extends TypedEventTarget { - private ws; - data: FirecrawlDocument[]; - status: CrawlStatusResponse["status"]; - constructor(id: string, app: FirecrawlApp); - close(): void; -} -export {}; From 2a8f55e533175d75381c699c68526763dfe5892a Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 11:12:28 +0200 Subject: [PATCH 05/47] perf(js-sdk): remove whole `z` import and instead use type-only import --- apps/js-sdk/firecrawl/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1bb4ee..95b4eebd 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,5 +1,5 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; +import type { ZodSchema } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; @@ -81,7 +81,7 @@ export interface ScrapeParams { onlyMainContent?: boolean; extract?: { prompt?: string; - schema?: z.ZodSchema | any; + schema?: ZodSchema | any; systemPrompt?: string; }; waitFor?: number; From 2b0e447bc26ec94f930af68de4d0ad4e6d6fb08f Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 11:13:48 +0200 Subject: [PATCH 06/47] perf(js-sdk): move `dotenv` and `uuid` to `devDependencies` --- apps/js-sdk/firecrawl/package-lock.json | 12 +++++++----- apps/js-sdk/firecrawl/package.json | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index ce6a1a4a..7c2ecbfd 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,19 +1,17 @@ { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -25,9 +23,11 @@ "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", + "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", - "typescript": "^5.4.5" + "typescript": "^5.4.5", + "uuid": "^9.0.1" } }, "node_modules/@ampproject/remapping": { @@ -1657,6 +1657,7 @@ "version": "16.4.5", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "dev": true, "engines": { "node": ">=12" }, @@ -3794,6 +3795,7 @@ "version": "9.0.1", "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e68b3014..62120b35 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -29,10 +29,8 @@ "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -41,6 +39,8 @@ }, "homepage": "https://github.com/mendableai/firecrawl#readme", "devDependencies": { + "uuid": "^9.0.1", + "dotenv": "^16.4.5", "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", From aa2cf686f4c891e5fe5b8be8eb15050bff01d261 Mon Sep 17 00:00:00 2001 From: Tadashi Shigeoka Date: Fri, 6 Sep 2024 21:41:31 +0900 Subject: [PATCH 07/47] [Docs] upgraded the path of the self-hosted documentation URL to `/v1`. --- SELF_HOST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index f631cf18..2fa87776 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -106,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad If you’d like to test the crawl endpoint, you can run this: ```bash - curl -X POST http://localhost:3002/v0/crawl \ + curl -X POST http://localhost:3002/v1/crawl \ -H 'Content-Type: application/json' \ -d '{ "url": "https://mendable.ai" From 2044e71fcf1fb811a94f8aae1b87acdfaaaac2be Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 6 Sep 2024 15:26:33 -0400 Subject: [PATCH 08/47] Docs to API Spec --- .../turning_docs_into_api_specs/api_spec.json | 771 ------------------ .../combined_api_spec.json | 510 ++++++++++++ .../dify_api_spec.json | 164 ---- .../docs.firecrawl.dev/api_spec_0.json | 211 ----- .../docs.firecrawl.dev/api_spec_1.json | 165 ---- .../docs.firecrawl.dev/api_spec_10.json | 93 --- .../docs.firecrawl.dev/api_spec_11.json | 131 --- .../docs.firecrawl.dev/api_spec_13.json | 87 -- .../docs.firecrawl.dev/api_spec_15.json | 83 -- .../docs.firecrawl.dev/api_spec_16.json | 200 ----- .../docs.firecrawl.dev/api_spec_2.json | 54 -- .../docs.firecrawl.dev/api_spec_22.json | 166 ---- .../docs.firecrawl.dev/api_spec_25.json | 229 ------ .../docs.firecrawl.dev/api_spec_26.json | 115 --- .../docs.firecrawl.dev/api_spec_3.json | 185 ----- .../docs.firecrawl.dev/api_spec_30.json | 212 ----- .../docs.firecrawl.dev/api_spec_31.json | 199 ----- .../docs.firecrawl.dev/api_spec_33.json | 202 ----- .../docs.firecrawl.dev/api_spec_34.json | 201 ----- .../docs.firecrawl.dev/api_spec_35.json | 245 ------ .../docs.firecrawl.dev/api_spec_4.json | 129 --- .../docs.firecrawl.dev/api_spec_5.json | 186 ----- .../docs.firecrawl.dev/api_spec_7.json | 86 -- .../docs.firecrawl.dev/api_spec_8.json | 59 -- .../docs.firecrawl.dev/combined_api_spec.json | 738 ----------------- .../turning_docs_into_api_specs.ipynb | 287 ------- .../turning_docs_into_api_specs.py | 137 ++++ 27 files changed, 647 insertions(+), 5198 deletions(-) delete mode 100644 examples/turning_docs_into_api_specs/api_spec.json create mode 100644 examples/turning_docs_into_api_specs/combined_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/dify_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb create mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py diff --git a/examples/turning_docs_into_api_specs/api_spec.json b/examples/turning_docs_into_api_specs/api_spec.json deleted file mode 100644 index d866efd3..00000000 --- a/examples/turning_docs_into_api_specs/api_spec.json +++ /dev/null @@ -1,771 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "/crawl/cancel/{jobId}": { - "/crawl/status/{jobId}": { - "get": { - "/scrape": { - "/search": { - "post": { - "components": { - "securitySchemes": { - "Authorization": { - "bearerFormat": "JWT", - "scheme": "bearer", - "type": "http" - } - } - }, - "description": "Send a request to perform a web search and get scraped results from the top pages.", - "operationId": "searchWeb", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "description": "Options for controlling the scraping behavior of search result pages.", - "properties": { - "fetchPageContent": { - "default": true, - "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", - "type": "boolean" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The search query.", - "required": true, - "type": "string" - }, - "searchOptions": { - "description": "Options for controlling the search.", - "properties": { - "limit": { - "description": "Maximum number of search results to return.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - }, - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "An array of search results.", - "items": { - "properties": { - "content": { - "description": "Raw content of the search result page.", - "type": "string" - }, - "markdown": { - "description": "Markdown content of the search result page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the search result page.", - "properties": { - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "nullable": true, - "type": "string" - }, - "sourceURL": { - "description": "Source URL of the search result page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "url": { - "description": "URL of the search result.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "description": "Indicates if the search was successful.", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Web search completed successfully." - } - } - }, - "summary": "Search the Web" - } - }, - "post": { - "description": "Send a request to scrape a single URL and get its content.", - "operationId": "scrapeURL", - "parameters": [], - "requestBody": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - }, - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", - "properties": { - "extractionPrompt": { - "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes.", - "type": "string" - }, - "extractionSchema": { - "description": "The schema for the data to be extracted, required only for LLM extraction modes.", - "type": "object" - }, - "mode": { - "default": "markdown", - "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM.", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for controlling the scraping behavior.", - "properties": { - "fullPageScreenshot": { - "default": false, - "description": "Include a full page screenshot of the page that you are scraping.", - "type": "boolean" - }, - "headers": { - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", - "type": "object" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - }, - "removeTags": { - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "default": false, - "description": "Replace all relative paths with absolute paths for images and links", - "type": "boolean" - }, - "screenshot": { - "default": false, - "description": "Include a screenshot of the top of the page that you are scraping.", - "type": "boolean" - }, - "waitFor": { - "default": 0, - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "default": 30000, - "description": "Timeout in milliseconds for the request", - "type": "integer" - }, - "url": { - "description": "The URL to scrape.", - "required": true, - "type": "string" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the request.", - "nullable": true, - "type": "string" - }, - "llm_extraction": { - "description": "Extracted data from the page using the specified schema, only present if an LLM extraction mode was used.", - "nullable": true, - "type": "object" - }, - "markdown": { - "description": "Markdown version of the page content.", - "type": "string" - }, - "metadata": { - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": "string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the request.", - "nullable": true, - "type": "string" - }, - "warning": { - "description": "Warning message from the LLM extraction process, if any.", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "success": { - "description": "Indicates whether the scraping was successful.", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "URL scraped successfully." - } - } - }, - "summary": "Scrape a URL" - } - }, - "description": "Send a request to get the status and results of a crawl job.", - "operationId": "getCrawlJobStatus", - "parameters": [ - { - "description": "ID of the crawl job to check.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": {} - }, - "responses": { - "200": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - }, - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "The number of pages crawled so far.", - "type": "integer" - }, - "data": { - "description": "The crawl results. Only available when the crawl job is completed.", - "items": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", - "type": "string" - }, - "index": { - "description": "The index of the crawled page in the results.", - "type": "integer" - }, - "markdown": { - "description": "Markdown content of the page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the page.", - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": "string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "partial_data": { - "description": "Partial results streamed as the crawl progresses. This feature is in alpha and may change.", - "items": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", - "type": "string" - }, - "index": { - "description": "The index of the crawled page in the results.", - "type": "integer" - }, - "markdown": { - "description": "Markdown content of the page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the page.", - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": "string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Status of the crawl job. Can be 'completed', 'active', 'failed', or 'paused'.", - "enum": [ - "completed", - "active", - "failed", - "paused" - ], - "type": "string" - }, - "total": { - "description": "The total estimated number of pages to crawl.", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status retrieved." - } - }, - "summary": "Get Crawl Job Status" - } - }, - "delete": { - "description": "Send a request to cancel a running crawl job.", - "operationId": "cancelCrawlJob", - "parameters": [ - { - "description": "ID of the crawl job to cancel.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": {} - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "The status of the crawl job cancellation request, usually 'cancelled'.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job cancellation request submitted." - }, - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - } - }, - "summary": "Cancel a Crawl Job" - } - }, - "description": "Send a request to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.", - "operationId": "crawlWebsite", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Options for controlling the crawling behavior.", - "properties": { - "allowBackwardCrawling": { - "default": false, - "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", - "type": "boolean" - }, - "allowExternalContentLinks": { - "default": false, - "description": "Allows the crawler to follow links to external websites.", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "default": false, - "description": "Generate alt text for images using LLMs (must have a paid plan)", - "type": "boolean" - }, - "ignoreSitemap": { - "default": false, - "description": "Ignore the website sitemap when crawling", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "default": 10000, - "description": "Maximum number of pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.", - "type": "integer" - }, - "mode": { - "default": "default", - "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "default": false, - "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for controlling the scraping behavior of individual pages.", - "properties": { - "fullPageScreenshot": { - "default": false, - "description": "Include a full page screenshot of the page that you are scraping.", - "type": "boolean" - }, - "headers": { - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", - "type": "object" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - }, - "removeTags": { - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "default": false, - "description": "Replace all relative paths with absolute paths for images and links", - "type": "boolean" - }, - "screenshot": { - "default": false, - "description": "Include a screenshot of the top of the page that you are scraping.", - "type": "boolean" - }, - "waitFor": { - "default": 0, - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "The base URL to start crawling from", - "required": true, - "type": "string" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "The ID of the submitted crawl job.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted successfully." - }, - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - } - } - }, - "summary": "Crawl a Website" - } - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/combined_api_spec.json b/examples/turning_docs_into_api_specs/combined_api_spec.json new file mode 100644 index 00000000..526dec8b --- /dev/null +++ b/examples/turning_docs_into_api_specs/combined_api_spec.json @@ -0,0 +1,510 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "https://docs.firecrawl.dev/api-reference API Specification", + "version": "1.0.0" + }, + "paths": { + "/crawl": { + "post": { + "summary": "Crawl a website", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Base URL to crawl" + }, + "excludePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to exclude" + }, + "includePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to include" + }, + "maxDepth": { + "type": "integer", + "description": "Maximum crawl depth" + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore sitemap?" + }, + "limit": { + "type": "integer", + "description": "Maximum pages to crawl" + }, + "allowBackwardLinks": { + "type": "boolean", + "description": "Allow backward links?" + }, + "allowExternalLinks": { + "type": "boolean", + "description": "Allow external links?" + }, + "webhook": { + "type": "string", + "description": "Webhook URL" + }, + "scrapeOptions": { + "type": "object", + "properties": { + "formats": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Formats to include" + }, + "headers": { + "type": "object", + "description": "Headers to send" + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include" + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only main content?" + }, + "waitFor": { + "type": "integer", + "description": "Wait time in ms" + } + } + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Crawl started", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string" + } + } + } + } + } + } + }, + "security": [ + { + "Authorization": [] + } + ] + } + }, + "/scrape": { + "post": { + "summary": "Scrape a webpage", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "URL to scrape" + }, + "formats": { + "type": "array", + "description": "Output formats", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] + } + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only main content" + }, + "includeTags": { + "type": "array", + "description": "Tags to include", + "items": { + "type": "string" + } + }, + "excludeTags": { + "type": "array", + "description": "Tags to exclude", + "items": { + "type": "string" + } + }, + "headers": { + "type": "object", + "description": "Request headers" + }, + "waitFor": { + "type": "integer", + "description": "Delay in ms" + }, + "timeout": { + "type": "integer", + "description": "Timeout in ms" + }, + "extract": { + "type": "object", + "description": "Extract object", + "properties": { + "schema": { + "type": "object", + "description": "Extraction schema" + }, + "systemPrompt": { + "type": "string", + "description": "System prompt" + }, + "prompt": { + "type": "string", + "description": "Extraction prompt" + } + } + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Successful scrape", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string" + }, + "rawHtml": { + "type": "string" + }, + "screenshot": { + "type": "string" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string" + } + } + }, + "llm_extraction": { + "type": "object" + }, + "warning": { + "type": "string" + } + } + } + } + } + } + } + } + }, + "security": [ + { + "Bearer": [] + } + ] + } + }, + "/v1/crawl/{id}": { + "get": { + "summary": "Get crawl status", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl status", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Current status of crawl" + }, + "total": { + "type": "integer", + "description": "Total pages crawled" + }, + "completed": { + "type": "integer", + "description": "Number of pages crawled" + }, + "creditsUsed": { + "type": "integer", + "description": "Credits used" + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "Crawl expiry" + }, + "next": { + "type": "string", + "nullable": true, + "description": "URL for next data" + }, + "data": { + "type": "array", + "description": "Data of the crawl", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string" + }, + "rawHtml": { + "type": "string" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + }, + "screenshot": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string" + } + } + } + } + } + } + } + } + } + } + } + }, + "security": [ + { + "Bearer": [] + } + ] + } + }, + "/crawl/{id}": { + "delete": { + "summary": "Cancel crawl job", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job cancelled", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "message": { + "type": "string" + } + } + } + } + } + } + } + } + }, + "/map": { + "post": { + "summary": "Map website and return links", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Base URL to crawl" + }, + "search": { + "type": "string", + "description": "Search query for mapping" + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore sitemap?" + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains?" + }, + "limit": { + "type": "integer", + "description": "Max links to return" + } + }, + "required": [ + "url" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful mapping", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + } + } + } + } + } + }, + "components": { + "schemas": {} + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/dify_api_spec.json b/examples/turning_docs_into_api_specs/dify_api_spec.json deleted file mode 100644 index e6eec457..00000000 --- a/examples/turning_docs_into_api_specs/dify_api_spec.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "openapi": "3.0.0", - "info": { - "title": "Knowledge Base API", - "description": "API for managing knowledge bases and documents." - }, - "paths": { - "/datasets": { - "post": { - "summary": "Create an Empty Dataset", - "description": "Only used to create an empty dataset", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "name": { - "type": "string" - } - } - } - } - } - }, - "responses": {} - }, - "get": { - "summary": "Dataset List", - "parameters": [ - { - "name": "page", - "in": "query", - "schema": { - "type": "integer" - } - }, - { - "name": "limit", - "in": "query", - "schema": { - "type": "integer" - } - } - ], - "responses": {} - } - }, - "/datasets/{dataset_id}/document/create_by_text": { - "post": { - "summary": "Create Document by Text", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "text": { - "type": "string" - }, - "indexing_technique": { - "type": "string" - }, - "process_rule": { - "type": "object" - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/document/create_by_file": { - "post": { - "summary": "Create Document by File", - "requestBody": { - "content": { - "multipart/form-data": { - "schema": { - "type": "object", - "properties": { - "data": { - "type": "string" - }, - "file": { - "type": "string", - "format": "binary" - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{batch}/indexing-status": { - "get": { - "summary": "Get Document Embedding Status (Progress)", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{document_id}": { - "delete": { - "summary": "Delete Document", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents": { - "get": { - "summary": "Dataset Document List", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{document_id}/segments": { - "post": { - "summary": "Add Segments", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "segments": { - "type": "array", - "items": { - "type": "object", - "properties": { - "content": { - "type": "string" - }, - "answer": { - "type": "string" - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/segments/{segment_id}": { - "delete": { - "summary": "Delete Document Segment", - "responses": {} - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json deleted file mode 100644 index 84bce02c..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json +++ /dev/null @@ -1,211 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawling options.", - "properties": { - "excludes": { - "description": "URL patterns to exclude.", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URL patterns to include.", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl.", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth.", - "type": "integer" - }, - "mode": { - "description": "Crawling mode.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job initiated." - } - }, - "summary": "Crawl multiple pages." - } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "Crawl job ID.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Data extraction options.", - "properties": { - "extractionPrompt": { - "description": "Prompt for data extraction.", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for data extraction.", - "type": "object" - }, - "mode": { - "description": "Extraction mode.", - "enum": [ - "llm-extraction", - "llm-extraction-from-raw-html" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in milliseconds.", - "type": "integer" - }, - "url": { - "description": "URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Successful scraping." - } - }, - "summary": "Scrape a single page." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json deleted file mode 100644 index 8656c978..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json +++ /dev/null @@ -1,165 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "allowBackwardCrawling": { - "description": "Allow backward crawling", - "type": "boolean" - }, - "allowExternalContentLinks": { - "description": "Allow external links", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "ignoreSitemap": { - "description": "Ignore website sitemap", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only crawled URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for requests", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Return only main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Use absolute paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait for page load (ms)", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID of the crawl", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl request successful" - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Crawl a website" - } - } - }, - "securitySchemes": { - "Bearer": { - "bearerFormat": "JWT", - "scheme": "bearer", - "type": "http" - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json deleted file mode 100644 index 55f73a32..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check_crawl_status": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl job status" - } - }, - "/crawl": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Job ID" - } - }, - "summary": "Crawl URL and subpages" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json deleted file mode 100644 index e19ed056..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "example": { - "extractorOptions": { - "extractionPrompt": "Based on the information on the page, extract the information from the schema. ", - "extractionSchema": { - "properties": { - "company_mission": { - "type": "string" - }, - "is_in_yc": { - "type": "boolean" - }, - "is_open_source": { - "type": "boolean" - }, - "supports_sso": { - "type": "boolean" - } - }, - "required": [ - "company_mission", - "supports_sso", - "is_open_source", - "is_in_yc" - ], - "type": "object" - }, - "mode": "llm-extraction" - }, - "url": "https://docs.firecrawl.dev/" - }, - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionPrompt": { - "description": "Prompt for extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for data extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "example": { - "data": { - "content": "Raw Content", - "llm_extraction": { - "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", - "is_in_yc": true, - "is_open_source": false, - "supports_sso": true - }, - "metadata": { - "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", - "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", - "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", - "ogLocaleAlternate": [], - "ogSiteName": "Mendable", - "ogTitle": "Mendable", - "ogUrl": "https://docs.firecrawl.dev/", - "robots": "follow, index", - "sourceURL": "https://docs.firecrawl.dev/", - "title": "Mendable" - } - }, - "success": true - }, - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "metadata": { - "type": "object" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "summary": "Extract data from pages." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json deleted file mode 100644 index 0352c66f..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search and scrape." - } - }, - "summary": "Search web, scrape, return markdown." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json deleted file mode 100644 index e7384f8e..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "url": { - "description": "Website URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "items": { - "properties": { - "markdown": { - "description": "Markdown content.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - } - } - }, - "description": "Website crawled successfully." - } - }, - "summary": "Crawl a website." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "url": { - "description": "Page URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "text/plain": { - "schema": { - "description": "Scraped content.", - "type": "string" - } - } - }, - "description": "Page scraped successfully." - } - }, - "summary": "Scrape a single page." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json deleted file mode 100644 index ed6fb9d6..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json +++ /dev/null @@ -1,200 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawler_options": { - "properties": { - "exclude": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Max pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs", - "type": "boolean" - }, - "timeout": { - "description": "Timeout in milliseconds", - "type": "integer" - } - }, - "type": "object" - }, - "page_options": { - "properties": { - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Crawl successful." - } - }, - "summary": "Crawl a website." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractor_options": { - "properties": { - "extractionPrompt": { - "description": "Prompt for extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for extraction", - "type": "string" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "page_options": { - "properties": { - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in milliseconds", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Scrape successful." - } - }, - "summary": "Scrape a website." - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "page_options": { - "properties": { - "fetchPageContent": { - "description": "Fetch full content", - "type": "boolean" - }, - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search query string", - "type": "string" - }, - "search_options": { - "properties": { - "limit": { - "description": "Max results", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Search successful." - } - }, - "summary": "Search Firecrawl index." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json deleted file mode 100644 index 25cf6c05..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl/cancel/{jobId}": { - "delete": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Returns cancelled." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Cancel crawl job" - } - } - }, - "securitySchemes": { - "Bearer": { - "bearerFormat": "Bearer ", - "scheme": "bearer", - "type": "http" - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json deleted file mode 100644 index ac146a63..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json +++ /dev/null @@ -1,166 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check-crawl-status/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current progress", - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "description": "Raw content", - "type": "string" - }, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "type": "object" - }, - "provider": { - "description": "Data provider", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Job status", - "type": "string" - }, - "total": { - "description": "Total pages", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawler options", - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted." - } - }, - "summary": "Crawl a URL." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Extractor options", - "type": "object" - }, - "pageOptions": { - "description": "Page options", - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Scraped data", - "type": "object" - }, - "success": { - "description": "Success flag", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json deleted file mode 100644 index 9701a462..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json +++ /dev/null @@ -1,229 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "excludes": { - "description": "Paths to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Paths to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "returnOnlyUrls": { - "description": "Only return URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job created" - } - }, - "summary": "Crawl a website" - } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": { - "properties": { - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Job status", - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Get crawl job status" - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scrape results" - } - }, - "summary": "Scrape a webpage" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json deleted file mode 100644 index b642e9c0..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "example": { - "extractorOptions": { - "extractionPrompt": "Extract company info.", - "extractionSchema": { - "properties": { - "company_description": { - "type": "string" - }, - "company_industry": { - "type": "string" - }, - "who_they_serve": { - "type": "string" - } - }, - "required": [ - "company_description", - "company_industry", - "who_they_serve" - ], - "type": "object" - }, - "mode": "llm-extraction" - }, - "pageOptions": { - "onlyMainContent": true - }, - "url": "https://example.com" - }, - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction.", - "type": "string" - }, - "extractionSchema": { - "properties": { - "properties": { - "company_description": { - "type": "string" - }, - "company_industry": { - "type": "string" - }, - "who_they_serve": { - "type": "string" - } - }, - "required": [ - "company_description", - "company_industry", - "who_they_serve" - ], - "type": { - "type": "string" - } - }, - "type": "object" - }, - "mode": { - "description": "Extraction mode.", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Successful scrape." - } - }, - "summary": "Scrape data from URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json deleted file mode 100644 index bcf94159..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json +++ /dev/null @@ -1,185 +0,0 @@ -{ - "components": { - "securitySchemes": { - "bearerAuth": { - "scheme": "bearer", - "type": "http" - } - } - }, - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction", - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for LLM extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for request", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Only return main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Replace relative paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in ms", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in ms", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "security": [ - { - "bearerAuth": [] - } - ], - "summary": "Scrape a webpage" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json deleted file mode 100644 index bc542e2a..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawl job options", - "properties": { - "excludes": { - "description": "Pages to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Pages to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Max pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options", - "properties": { - "onlyMainContent": { - "description": "Only scrape main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "required": [ - "url" - ], - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Crawl job result", - "type": "object" - } - } - }, - "description": "Crawl job result" - } - }, - "summary": "Crawl a website" - } - }, - "/crawl/{jobId}/cancel": { - "post": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Cancellation status", - "type": "object" - } - } - }, - "description": "Cancellation status" - } - }, - "summary": "Cancel crawl job" - } - }, - "/crawl/{jobId}/status": { - "get": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Crawl status", - "type": "object" - } - } - }, - "description": "Crawl status" - } - }, - "summary": "Check crawl status" - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "LLM extraction options", - "properties": { - "extractionSchema": { - "description": "JSON schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "required": [ - "url" - ], - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Scraped data", - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL" - } - }, - "/search": { - "get": { - "parameters": [ - { - "description": "Search query", - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Search results", - "type": "object" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search and scrape" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json deleted file mode 100644 index 07f71759..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "excludes": { - "description": "Paths to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Paths to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "Starting URL for crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Unique job identifier", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job started" - } - }, - "summary": "Crawl a website" - } - }, - "/crawl/{jobId}/status": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "Current job status", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl status" - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionSchema": { - "description": "Zod schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Extracted data", - "type": "object" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL" - } - }, - "/search": { - "get": { - "parameters": [ - { - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "items": { - "properties": { - "content": { - "description": "Page content (optional)", - "type": "string" - }, - "url": { - "description": "Result URL", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search for a query" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json deleted file mode 100644 index b45ae841..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json +++ /dev/null @@ -1,202 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Options for crawling", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URLs to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for page content", - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Unique crawl job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job started." - } - }, - "summary": "Crawl a website." - } - }, - "/crawl/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "Current job status", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for data extraction", - "properties": { - "extractionSchema": { - "description": "Pydantic schema", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for page content", - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "query": { - "description": "Search query", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Search results." - } - }, - "summary": "Search the web." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json deleted file mode 100644 index 3bafda42..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json +++ /dev/null @@ -1,201 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "0.1" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawl job options", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URLs to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options", - "properties": { - "onlyMainContent": { - "description": "Only scrape main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Crawl job started" - } - }, - "summary": "Crawl a website." - } - }, - "/crawl/{job_id}/cancel": { - "post": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "job_id", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Cancellation status" - } - }, - "summary": "Cancel crawl job." - } - }, - "/crawl/{job_id}/status": { - "get": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "job_id", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Crawl status" - } - }, - "summary": "Check crawl status." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "LLM extraction options", - "properties": { - "extractionSchema": { - "description": "JSON schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL." - } - }, - "/search": { - "get": { - "parameters": [ - { - "description": "Search query", - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search and scrape results." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json deleted file mode 100644 index 890d31b1..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json +++ /dev/null @@ -1,245 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check-crawl-status": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current page count", - "type": "integer" - }, - "data": { - "description": "Crawl data", - "items": { - "properties": { - "content": { - "description": "Raw content", - "type": "string" - }, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "properties": { - "description": { - "description": "Page description", - "type": "string" - }, - "language": { - "description": "Page language", - "type": "string" - }, - "sourceURL": { - "description": "Page URL", - "type": "string" - }, - "title": { - "description": "Page title", - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "description": "Content provider", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Crawl status", - "type": "string" - }, - "total": { - "description": "Total page count", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawler options", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted." - } - }, - "summary": "Crawl a URL." - } - }, - "/scrape-url": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Extractor options", - "properties": { - "extractionSchema": { - "description": "Extraction schema", - "type": "string" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page options", - "properties": { - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Scraped data", - "properties": { - "content": { - "description": "Raw content", - "type": "string" - }, - "html": { - "description": "HTML content", - "type": "string" - }, - "llm_extraction": { - "description": "LLM extraction results", - "type": "object" - }, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content", - "type": "string" - }, - "warning": { - "description": "Warning message", - "type": "string" - } - }, - "type": "object" - }, - "success": { - "description": "Request success", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json deleted file mode 100644 index daf53932..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "components": { - "securitySchemes": { - "Bearer": { - "scheme": "bearer", - "type": "http" - } - } - }, - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch content of each page.", - "type": "boolean" - }, - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only return main content.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The query to search for", - "type": "string" - }, - "searchOptions": { - "properties": { - "limit": { - "description": "Maximum number of results.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Search the web." - } - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json deleted file mode 100644 index 4fae28c0..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json +++ /dev/null @@ -1,186 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current page number", - "type": "integer" - }, - "data": { - "description": "Data from the job", - "items": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "description": "HTML content", - "nullable": true, - "type": "string" - }, - "index": { - "description": "Page number crawled", - "type": "integer" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message of page", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "Status code of page", - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - }, - "{any other metadata}": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "partial_data": { - "description": "Partial documents (streaming)", - "items": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "description": "HTML content", - "nullable": true, - "type": "string" - }, - "index": { - "description": "Page number crawled", - "type": "integer" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message of page", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "Status code of page", - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - }, - "{any other metadata}": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Status of the job", - "type": "string" - }, - "total": { - "description": "Total number of pages", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Successful operation" - } - }, - "security": [ - { - "Authorization": [] - } - ], - "summary": "Get crawl job status" - } - } - }, - "securitySchemes": { - "Authorization": { - "bearerFormat": "Bearer ", - "scheme": "bearer", - "type": "http" - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json deleted file mode 100644 index b74b9886..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch page content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search term", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search" - } - }, - "summary": "Search and extract content" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json deleted file mode 100644 index 2d5f40e2..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/test": { - "get": { - "description": "Returns a test message.", - "responses": { - "200": { - "content": { - "text/plain": { - "schema": { - "example": "Hello, world!", - "type": "string" - } - } - }, - "description": "Successful operation" - } - }, - "summary": "Test endpoint" - } - }, - "/v0/crawl": { - "post": { - "description": "Processes crawl job for URL.", - "requestBody": { - "content": { - "application/json": { - "example": { - "url": "https://docs.firecrawl.dev" - }, - "schema": { - "properties": { - "url": { - "description": "Website URL", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "URL to crawl", - "required": true - }, - "responses": { - "200": { - "description": "Crawl initiated." - } - }, - "summary": "Crawl a given URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json deleted file mode 100644 index 77d67234..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json +++ /dev/null @@ -1,738 +0,0 @@ -{ - "components": { - "schemas": {} - }, - "info": { - "title": "https://docs.firecrawl.dev API Specification", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check_crawl_status": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl job status" - } - }, - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "allowBackwardCrawling": { - "description": "Allow backward crawling", - "type": "boolean" - }, - "allowExternalContentLinks": { - "description": "Allow external links", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "ignoreSitemap": { - "description": "Ignore website sitemap", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only crawled URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for requests", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Return only main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Use absolute paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait for page load (ms)", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID of the crawl", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl request successful" - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Crawl a website" - } - }, - "/crawl/cancel/{jobId}": { - "delete": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Returns cancelled." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Cancel crawl job" - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch content of each page.", - "type": "boolean" - }, - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only return main content.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The query to search for", - "type": "string" - }, - "searchOptions": { - "properties": { - "limit": { - "description": "Maximum number of results.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Search the web." - } - }, - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawling options.", - "properties": { - "excludes": { - "description": "URL patterns to exclude.", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URL patterns to include.", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl.", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth.", - "type": "integer" - }, - "mode": { - "description": "Crawling mode.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job initiated." - } - }, - "summary": "Crawl multiple pages." - } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "Crawl job ID.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction", - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for LLM extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for request", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Only return main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Replace relative paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in ms", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in ms", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "security": [ - { - "bearerAuth": [] - } - ], - "summary": "Scrape a webpage" - } - }, - "/v0/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch page content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search term", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search" - } - }, - "summary": "Search and extract content" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb deleted file mode 100644 index 1b97f67b..00000000 --- a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb +++ /dev/null @@ -1,287 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import os\n", - "import datetime\n", - "import time\n", - "from firecrawl import FirecrawlApp\n", - "import json\n", - "import google.generativeai as genai\n", - "from dotenv import load_dotenv\n", - "\n", - "# Load environment variables\n", - "load_dotenv()\n", - "\n", - "# Retrieve API keys from environment variables\n", - "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", - "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", - "\n", - "# Configure the Google Generative AI module with the API key\n", - "genai.configure(api_key=google_api_key)\n", - "model = genai.GenerativeModel(\"gemini-1.5-pro-001\")\n", - "\n", - "# Set the docs URL\n", - "docs_url=\"https://docs.firecrawl.dev\"\n", - "\n", - "# Initialize the FirecrawlApp with your API key\n", - "app = FirecrawlApp(api_key=firecrawl_api_key)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36\n" - ] - } - ], - "source": [ - "# Crawl all pages on docs\n", - "params = {\n", - " \"pageOptions\": {\n", - " \"onlyMainContent\": True\n", - " },\n", - "}\n", - "crawl_result = app.crawl_url(docs_url, params=params)\n", - "\n", - "print(len(crawl_result))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "prompt_instructions = f\"\"\"Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.\n", - "\n", - "If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}.\n", - "\n", - "Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.\n", - "\n", - "API Documentation Content:\n", - "{{content}}\n", - "\n", - "Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.\n", - "\n", - "Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}.\n", - "\n", - "To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}.\n", - "\"\"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "API specification saved to docs.firecrawl.dev/api_spec_0.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_1.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_2.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_3.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_4.json\n", - "An error occurred for page 5: 'content'\n", - "No API specification found for page 6\n", - "API specification saved to docs.firecrawl.dev/api_spec_7.json\n", - "No API specification found for page 8\n", - "No API specification found for page 9\n", - "API specification saved to docs.firecrawl.dev/api_spec_10.json\n", - "No API specification found for page 11\n", - "No API specification found for page 12\n", - "API specification saved to docs.firecrawl.dev/api_spec_13.json\n", - "No API specification found for page 14\n", - "No API specification found for page 15\n", - "No API specification found for page 16\n", - "No API specification found for page 17\n", - "No API specification found for page 18\n", - "No API specification found for page 19\n", - "No API specification found for page 20\n", - "No API specification found for page 21\n", - "No API specification found for page 22\n", - "No API specification found for page 23\n", - "No API specification found for page 24\n", - "No API specification found for page 25\n", - "No API specification found for page 26\n", - "No API specification found for page 27\n", - "No API specification found for page 28\n", - "No API specification found for page 29\n", - "No API specification found for page 30\n", - "No API specification found for page 31\n", - "No API specification found for page 32\n", - "No API specification found for page 33\n", - "No API specification found for page 34\n", - "No API specification found for page 35\n", - "Total API specifications collected: 8\n" - ] - } - ], - "source": [ - "# Create a folder for storing API specs\n", - "import os\n", - "import urllib.parse\n", - "\n", - "folder_name = urllib.parse.urlparse(docs_url).netloc\n", - "os.makedirs(folder_name, exist_ok=True)\n", - "\n", - "# Initialize a list to store all API specs\n", - "all_api_specs = []\n", - "\n", - "# Process each page in crawl_result\n", - "for index, result in enumerate(crawl_result):\n", - " if 'content' in result:\n", - " # Update prompt_instructions with the current page's content\n", - " current_prompt = prompt_instructions.replace(\"{content}\", result['content'])\n", - " try:\n", - " # Query the model\n", - " response = model.generate_content([current_prompt])\n", - " response_dict = response.to_dict()\n", - " response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", - " \n", - " # Remove the ```json code wrap if present\n", - " response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()\n", - " \n", - " # Parse JSON\n", - " json_data = json.loads(response_text)\n", - " \n", - " # Save non-empty API specs\n", - " if json_data != {}:\n", - " output_file = os.path.join(folder_name, f'api_spec_{index}.json')\n", - " with open(output_file, 'w') as f:\n", - " json.dump(json_data, f, indent=2, sort_keys=True)\n", - " print(f\"API specification saved to {output_file}\")\n", - " \n", - " # Add the API spec to the list\n", - " all_api_specs.append(json_data)\n", - " else:\n", - " print(f\"No API specification found for page {index}\")\n", - " \n", - " except json.JSONDecodeError:\n", - " print(f\"Error parsing JSON response for page {index}\")\n", - " except Exception as e:\n", - " print(f\"An error occurred for page {index}: {str(e)}\")\n", - "\n", - "# Print the total number of API specs collected\n", - "print(f\"Total API specifications collected: {len(all_api_specs)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Combined API specification saved to docs.firecrawl.dev/combined_api_spec.json\n", - "Total paths in combined spec: 8\n", - "Total schemas in combined spec: 0\n" - ] - } - ], - "source": [ - "# Combine all API specs and keep the most filled out spec for each path and method\n", - "combined_spec = {\n", - " \"openapi\": \"3.0.0\",\n", - " \"info\": {\n", - " \"title\": f\"{docs_url} API Specification\",\n", - " \"version\": \"1.0.0\"\n", - " },\n", - " \"paths\": {},\n", - " \"components\": {\n", - " \"schemas\": {}\n", - " }\n", - "}\n", - "\n", - "def count_properties(obj):\n", - " if isinstance(obj, dict):\n", - " return sum(count_properties(v) for v in obj.values()) + len(obj)\n", - " elif isinstance(obj, list):\n", - " return sum(count_properties(item) for item in obj)\n", - " else:\n", - " return 1\n", - "\n", - "for spec in all_api_specs:\n", - " if \"paths\" in spec:\n", - " for path, methods in spec[\"paths\"].items():\n", - " if path not in combined_spec[\"paths\"]:\n", - " combined_spec[\"paths\"][path] = {}\n", - " for method, details in methods.items():\n", - " if method not in combined_spec[\"paths\"][path] or count_properties(details) > count_properties(combined_spec[\"paths\"][path][method]):\n", - " combined_spec[\"paths\"][path][method] = details\n", - "\n", - " if \"components\" in spec and \"schemas\" in spec[\"components\"]:\n", - " for schema_name, schema in spec[\"components\"][\"schemas\"].items():\n", - " if schema_name not in combined_spec[\"components\"][\"schemas\"] or count_properties(schema) > count_properties(combined_spec[\"components\"][\"schemas\"][schema_name]):\n", - " combined_spec[\"components\"][\"schemas\"][schema_name] = schema\n", - "\n", - "# Save the combined API spec\n", - "output_file = os.path.join(folder_name, 'combined_api_spec.json')\n", - "with open(output_file, 'w') as f:\n", - " json.dump(combined_spec, f, indent=2, sort_keys=True)\n", - "\n", - "print(f\"Combined API specification saved to {output_file}\")\n", - "print(f\"Total paths in combined spec: {len(combined_spec['paths'])}\")\n", - "print(f\"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# note: turn this into a simple web app like roast my site\n", - "- select which methods you want to add\n", - "- generate a UI for each method\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py new file mode 100644 index 00000000..47b54ede --- /dev/null +++ b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py @@ -0,0 +1,137 @@ +# %% +import os +import datetime +import time +from firecrawl import FirecrawlApp +import json +import google.generativeai as genai +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +google_api_key = os.getenv("GOOGLE_API_KEY") +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") + +# Configure the Google Generative AI module with the API key +genai.configure(api_key=google_api_key) +model = genai.GenerativeModel("gemini-1.5-pro-001") + +# Set the docs URL +docs_url = "https://docs.firecrawl.dev/api-reference" + +# Initialize the FirecrawlApp with your API key +app = FirecrawlApp(api_key=firecrawl_api_key) + +# %% +# Crawl all pages on docs +crawl_result = app.crawl_url(docs_url) +print(f"Total pages crawled: {len(crawl_result['data'])}") + +# %% +# Define the prompt instructions for generating OpenAPI specs +prompt_instructions = """ +Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less. + +If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}. + +Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in. + +API Documentation Content: +{{content}} + +Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum. + +Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}. + +To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}. +""" + +# %% +# Initialize a list to store all API specs +all_api_specs = [] + +# Process each page in crawl_result +for index, page in enumerate(crawl_result['data']): + if 'markdown' in page: + # Update prompt_instructions with the current page's content + current_prompt = prompt_instructions.replace("{content}", page['markdown']) + try: + # Query the model + response = model.generate_content([current_prompt]) + response_dict = response.to_dict() + response_text = response_dict['candidates'][0]['content']['parts'][0]['text'] + + # Remove the ```json code wrap if present + response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip() + + # Parse JSON + json_data = json.loads(response_text) + + # Add non-empty API specs to the list + if json_data != {}: + all_api_specs.append(json_data) + print(f"API specification generated for page {index}") + else: + print(f"No API specification found for page {index}") + + except json.JSONDecodeError: + print(f"Error parsing JSON response for page {index}") + except Exception as e: + print(f"An error occurred for page {index}: {str(e)}") + +# Print the total number of API specs collected +print(f"Total API specifications collected: {len(all_api_specs)}") + +# %% +# Combine all API specs and keep the most filled out spec for each path and method +combined_spec = { + "openapi": "3.0.0", + "info": { + "title": f"{docs_url} API Specification", + "version": "1.0.0" + }, + "paths": {}, + "components": { + "schemas": {} + } +} + +# Helper function to count properties in an object +def count_properties(obj): + if isinstance(obj, dict): + return sum(count_properties(v) for v in obj.values()) + len(obj) + elif isinstance(obj, list): + return sum(count_properties(item) for item in obj) + else: + return 1 + +# Combine specs, keeping the most detailed version of each path and schema +for spec in all_api_specs: + # Combine paths + if "paths" in spec: + for path, methods in spec["paths"].items(): + if path not in combined_spec["paths"]: + combined_spec["paths"][path] = {} + for method, details in methods.items(): + if method not in combined_spec["paths"][path] or count_properties(details) > count_properties(combined_spec["paths"][path][method]): + combined_spec["paths"][path][method] = details + + # Combine schemas + if "components" in spec and "schemas" in spec["components"]: + for schema_name, schema in spec["components"]["schemas"].items(): + if schema_name not in combined_spec["components"]["schemas"] or count_properties(schema) > count_properties(combined_spec["components"]["schemas"][schema_name]): + combined_spec["components"]["schemas"][schema_name] = schema + +# Print summary of combined spec +print(f"Combined API specification generated") +print(f"Total paths in combined spec: {len(combined_spec['paths'])}") +print(f"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}") + +# Save the combined spec to a JSON file in the same directory as the Python file +output_file = os.path.join(os.path.dirname(__file__), "combined_api_spec.json") +with open(output_file, "w") as f: + json.dump(combined_spec, f, indent=2) + +print(f"Combined API specification saved to {output_file}") From 2d245a35f2131cb2f00b759b92adb75d306a4447 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 6 Sep 2024 15:27:58 -0400 Subject: [PATCH 09/47] Delete combined_api_spec.json --- .../combined_api_spec.json | 510 ------------------ 1 file changed, 510 deletions(-) delete mode 100644 examples/turning_docs_into_api_specs/combined_api_spec.json diff --git a/examples/turning_docs_into_api_specs/combined_api_spec.json b/examples/turning_docs_into_api_specs/combined_api_spec.json deleted file mode 100644 index 526dec8b..00000000 --- a/examples/turning_docs_into_api_specs/combined_api_spec.json +++ /dev/null @@ -1,510 +0,0 @@ -{ - "openapi": "3.0.0", - "info": { - "title": "https://docs.firecrawl.dev/api-reference API Specification", - "version": "1.0.0" - }, - "paths": { - "/crawl": { - "post": { - "summary": "Crawl a website", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "Base URL to crawl" - }, - "excludePaths": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to exclude" - }, - "includePaths": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to include" - }, - "maxDepth": { - "type": "integer", - "description": "Maximum crawl depth" - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore sitemap?" - }, - "limit": { - "type": "integer", - "description": "Maximum pages to crawl" - }, - "allowBackwardLinks": { - "type": "boolean", - "description": "Allow backward links?" - }, - "allowExternalLinks": { - "type": "boolean", - "description": "Allow external links?" - }, - "webhook": { - "type": "string", - "description": "Webhook URL" - }, - "scrapeOptions": { - "type": "object", - "properties": { - "formats": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Formats to include" - }, - "headers": { - "type": "object", - "description": "Headers to send" - }, - "includeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags to include" - }, - "excludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags to exclude" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only main content?" - }, - "waitFor": { - "type": "integer", - "description": "Wait time in ms" - } - } - } - } - } - } - } - }, - "responses": { - "200": { - "description": "Crawl started", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "id": { - "type": "string" - }, - "url": { - "type": "string" - } - } - } - } - } - } - }, - "security": [ - { - "Authorization": [] - } - ] - } - }, - "/scrape": { - "post": { - "summary": "Scrape a webpage", - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "URL to scrape" - }, - "formats": { - "type": "array", - "description": "Output formats", - "items": { - "type": "string", - "enum": [ - "markdown", - "html", - "rawHtml", - "links", - "screenshot", - "extract", - "screenshot@fullPage" - ] - } - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only main content" - }, - "includeTags": { - "type": "array", - "description": "Tags to include", - "items": { - "type": "string" - } - }, - "excludeTags": { - "type": "array", - "description": "Tags to exclude", - "items": { - "type": "string" - } - }, - "headers": { - "type": "object", - "description": "Request headers" - }, - "waitFor": { - "type": "integer", - "description": "Delay in ms" - }, - "timeout": { - "type": "integer", - "description": "Timeout in ms" - }, - "extract": { - "type": "object", - "description": "Extract object", - "properties": { - "schema": { - "type": "object", - "description": "Extraction schema" - }, - "systemPrompt": { - "type": "string", - "description": "System prompt" - }, - "prompt": { - "type": "string", - "description": "Extraction prompt" - } - } - } - } - } - } - } - }, - "responses": { - "200": { - "description": "Successful scrape", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "data": { - "type": "object", - "properties": { - "markdown": { - "type": "string" - }, - "html": { - "type": "string" - }, - "rawHtml": { - "type": "string" - }, - "screenshot": { - "type": "string" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "statusCode": { - "type": "integer" - }, - "error": { - "type": "string" - } - } - }, - "llm_extraction": { - "type": "object" - }, - "warning": { - "type": "string" - } - } - } - } - } - } - } - } - }, - "security": [ - { - "Bearer": [] - } - ] - } - }, - "/v1/crawl/{id}": { - "get": { - "summary": "Get crawl status", - "parameters": [ - { - "name": "id", - "in": "path", - "description": "ID of crawl job", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl status", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "status": { - "type": "string", - "description": "Current status of crawl" - }, - "total": { - "type": "integer", - "description": "Total pages crawled" - }, - "completed": { - "type": "integer", - "description": "Number of pages crawled" - }, - "creditsUsed": { - "type": "integer", - "description": "Credits used" - }, - "expiresAt": { - "type": "string", - "format": "date-time", - "description": "Crawl expiry" - }, - "next": { - "type": "string", - "nullable": true, - "description": "URL for next data" - }, - "data": { - "type": "array", - "description": "Data of the crawl", - "items": { - "type": "object", - "properties": { - "markdown": { - "type": "string" - }, - "html": { - "type": "string" - }, - "rawHtml": { - "type": "string" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - }, - "screenshot": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "statusCode": { - "type": "integer" - }, - "error": { - "type": "string" - } - } - } - } - } - } - } - } - } - } - } - }, - "security": [ - { - "Bearer": [] - } - ] - } - }, - "/crawl/{id}": { - "delete": { - "summary": "Cancel crawl job", - "security": [ - { - "bearerAuth": [] - } - ], - "parameters": [ - { - "name": "id", - "in": "path", - "description": "ID of crawl job", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job cancelled", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "message": { - "type": "string" - } - } - } - } - } - } - } - } - }, - "/map": { - "post": { - "summary": "Map website and return links", - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "Base URL to crawl" - }, - "search": { - "type": "string", - "description": "Search query for mapping" - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore sitemap?" - }, - "includeSubdomains": { - "type": "boolean", - "description": "Include subdomains?" - }, - "limit": { - "type": "integer", - "description": "Max links to return" - } - }, - "required": [ - "url" - ] - } - } - } - }, - "responses": { - "200": { - "description": "Successful mapping", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } - } - } - } - } - }, - "components": { - "schemas": {} - } -} \ No newline at end of file From 79870e73053ef2a960112f2fa2227b39d0512bdb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 6 Sep 2024 20:15:26 -0300 Subject: [PATCH 10/47] Update excludeTags.ts --- apps/api/src/scraper/WebScraper/utils/excludeTags.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index bb9c5194..400ef84f 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -39,16 +39,8 @@ export const excludeNonMainTags = [ "#search", ".share", "#share", - ".pagination", - "#pagination", ".widget", "#widget", - ".related", - "#related", - ".tag", - "#tag", - ".category", - "#category", ".cookie", "#cookie" ]; From 5758af3291aaebbdc13c3e7c469b3406f730476a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 7 Sep 2024 13:12:46 -0300 Subject: [PATCH 11/47] Update website_params.ts --- .../src/scraper/WebScraper/utils/custom/website_params.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index af8d1f34..8169d9d3 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -242,5 +242,13 @@ export const urlSpecificParams = { engine: "chrome-cdp", }, }, + }, + "lorealparis.hu":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + engine: "tlsclient", + }, + }, } }; From 48c665519ebff263316601a33620115d68d00c41 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 7 Sep 2024 13:42:45 -0300 Subject: [PATCH 12/47] Update credit_billing.ts --- .../src/services/billing/credit_billing.ts | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 53031de9..d22f0372 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -186,7 +186,8 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { getValue(cacheKeyCoupons) ]); - let subscription, subscriptionError, coupons; + let subscription, subscriptionError; + let coupons : {credits: number}[]; if (cachedSubscription && cachedCoupons) { subscription = JSON.parse(cachedSubscription); @@ -225,16 +226,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { ); } + + // If there are available coupons and they are enough for the operation + if (couponCredits >= credits) { + return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; + } // Free credits, no coupons if (!subscription || subscriptionError) { - // If there is no active subscription but there are available coupons - if (couponCredits >= credits) { - return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; - } - let creditUsages; let creditUsageError; let totalCreditsUsed = 0; @@ -251,6 +252,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const retryInterval = 2000; // 2 seconds while (retries < maxRetries) { + // Reminder, this has an 1000 limit. const result = await supabase_service .from("credit_usage") .select("credits_used") @@ -292,7 +294,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { end.setDate(end.getDate() + 30); // check if usage is within 80% of the limit const creditLimit = FREE_CREDITS; - const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit; + const creditUsagePercentage = totalCreditsUsed / creditLimit; // Add a check to ensure totalCreditsUsed is greater than 0 if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { @@ -306,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } // 5. Compare the total credits used with the credits allowed by the plan. - if (totalCreditsUsed + credits > FREE_CREDITS) { + if (totalCreditsUsed > FREE_CREDITS) { // Send email notification for insufficient credits await sendNotification( team_id, @@ -366,7 +368,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { // Get the price details from cache or database const priceCacheKey = `price_${subscription.price_id}`; - let price; + let price : {credits: number}; try { const cachedPrice = await getValue(priceCacheKey); @@ -394,29 +396,31 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { Logger.error(`Error retrieving or caching price: ${error}`); Sentry.captureException(error); // If errors, just assume it's a big number so user don't get an error - price = { credits: 1000000 }; + price = { credits: 10000000 }; } const creditLimit = price.credits; - const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit; + + // Removal of + credits + const creditUsagePercentage = adjustedCreditsUsed / creditLimit; // Compare the adjusted total credits used with the credits allowed by the plan - if (adjustedCreditsUsed + credits > price.credits) { - // await sendNotification( - // team_id, - // NotificationType.LIMIT_REACHED, - // subscription.current_period_start, - // subscription.current_period_end - // ); + if (adjustedCreditsUsed > price.credits) { + await sendNotification( + team_id, + NotificationType.LIMIT_REACHED, + subscription.current_period_start, + subscription.current_period_end + ); return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed }; - } else if (creditUsagePercentage >= 0.8) { + } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit - // await sendNotification( - // team_id, - // NotificationType.APPROACHING_LIMIT, - // subscription.current_period_start, - // subscription.current_period_end - // ); + await sendNotification( + team_id, + NotificationType.APPROACHING_LIMIT, + subscription.current_period_start, + subscription.current_period_end + ); } return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed }; From fbdfa1256bb6095a08434b356fb51688d5337780 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 8 Sep 2024 13:07:10 -0300 Subject: [PATCH 13/47] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index d22f0372..6a71b40a 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -308,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } // 5. Compare the total credits used with the credits allowed by the plan. - if (totalCreditsUsed > FREE_CREDITS) { + if (totalCreditsUsed >= FREE_CREDITS) { // Send email notification for insufficient credits await sendNotification( team_id, @@ -405,7 +405,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const creditUsagePercentage = adjustedCreditsUsed / creditLimit; // Compare the adjusted total credits used with the credits allowed by the plan - if (adjustedCreditsUsed > price.credits) { + if (adjustedCreditsUsed >= price.credits) { await sendNotification( team_id, NotificationType.LIMIT_REACHED, From 60a15d00eb73244257b99dfd05a2d55b0aab9dd4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 8 Sep 2024 16:39:12 -0300 Subject: [PATCH 14/47] Update types.ts --- apps/api/src/controllers/v1/types.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index c4e0cf84..63ec1dd4 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -322,6 +322,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { removeTags: x.excludeTags, onlyMainContent: x.onlyMainContent, waitFor: x.waitFor, + headers: x.headers, includeLinks: x.formats.includes("links"), screenshot: x.formats.includes("screenshot"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"), From 22a5e85899eb893c9a68f53201e13f5fb569bc46 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:26:55 -0300 Subject: [PATCH 15/47] Update index.ts --- apps/api/src/index.ts | 102 ++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 58370158..1edf3759 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry" +import "./services/sentry"; import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from 'node:http'; -import https from 'node:https'; -import CacheableLookup from 'cacheable-lookup'; +import http from "node:http"; +import https from "node:https"; +import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup:false + lookup: false, }); cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent) +cacheable.install(https.globalAgent); if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,9 +115,7 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([ - scrapeQueue.getWaitingCount(), - ]); + const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -190,38 +188,77 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { - if (err instanceof ZodError) { - res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: Response, + next: NextFunction + ) => { + if (err instanceof ZodError) { + res + .status(400) + .json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); + } } - }); + ); Sentry.setupExpressErrorHandler(app); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: ResponseWithSentry, + next: NextFunction + ) => { + if ( + err instanceof SyntaxError && + "status" in err && + err.status === 400 && + "body" in err + ) { + return res + .status(400) + .json({ success: false, error: "Bad request, malformed JSON" }); + } - Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); - }); + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error( + "Error occurred in request! (" + + req.path + + ") -- ID " + + id + + " -- " + + verbose + ); + res + .status(500) + .json({ + success: false, + error: + "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + + id, + }); + } + ); Logger.info(`Worker ${process.pid} started`); } - - // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -230,6 +267,3 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); - - - From ca9a781eb7fbadf7aee7dd6926aea3a0b1ca5e07 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:27:55 -0300 Subject: [PATCH 16/47] Update index.ts --- apps/api/src/index.ts | 106 +++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1edf3759..7d8817af 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry"; +import "./services/sentry" import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from "node:http"; -import https from "node:https"; -import CacheableLookup from "cacheable-lookup"; +import http from 'node:http'; +import https from 'node:https'; +import CacheableLookup from 'cacheable-lookup'; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup: false, + lookup:false }); cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent); +cacheable.install(https.globalAgent) if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,7 +115,9 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); + const [waitingJobs] = await Promise.all([ + scrapeQueue.getWaitingCount(), + ]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -188,77 +190,42 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: Response, - next: NextFunction - ) => { - if (err instanceof ZodError) { - res - .status(400) - .json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { + if (err instanceof ZodError) { + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); - } } - ); + }); Sentry.setupExpressErrorHandler(app); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: ResponseWithSentry, - next: NextFunction - ) => { - if ( - err instanceof SyntaxError && - "status" in err && - err.status === 400 && - "body" in err - ) { - return res - .status(400) - .json({ success: false, error: "Bad request, malformed JSON" }); - } - - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } - - Logger.error( - "Error occurred in request! (" + - req.path + - ") -- ID " + - id + - " -- " + - verbose - ); - res - .status(500) - .json({ - success: false, - error: - "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + - id, - }); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { + return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); } - ); + + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + }); Logger.info(`Worker ${process.pid} started`); } + + // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -267,3 +234,6 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); + + + From 17e419a7fb82dacba45692ea676f0487e66d5f70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 21:06:23 -0300 Subject: [PATCH 17/47] Nick: --- .../scraper/WebScraper/scrapers/fireEngine.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e7361c5c..a3f393c8 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -55,7 +55,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); let waitParam = reqParams["params"]?.wait ?? waitFor; - let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; + let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8bafd203..2be65899 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -96,15 +96,15 @@ function getScrapingFallbackOrder( "fetch", ].filter(Boolean); - if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { - defaultOrder = [ - "fire-engine", - useFireEngine ? undefined : "playwright", - ...defaultOrder.filter( - (scraper) => scraper !== "fire-engine" && scraper !== "playwright" - ), - ].filter(Boolean); - } + // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { + // defaultOrder = [ + // "fire-engine", + // useFireEngine ? undefined : "playwright", + // ...defaultOrder.filter( + // (scraper) => scraper !== "fire-engine" && scraper !== "playwright" + // ), + // ].filter(Boolean); + // } const filteredDefaultOrder = defaultOrder.filter( (scraper: (typeof baseScrapers)[number]) => From a6bcf7b4389409f9972a69b3d48e1ecd084e3121 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 08:51:58 +0200 Subject: [PATCH 18/47] fix(v0/crawl-status): don't crash on big crawls when requesting jobs from supabase --- apps/api/src/controllers/v0/crawl-status.ts | 8 ++--- apps/api/src/controllers/v0/status.ts | 2 +- apps/api/src/lib/supabase-jobs.ts | 34 +++++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index a3f3f16f..41491f86 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -4,16 +4,16 @@ import { RateLimiterMode } from "../../../src/types"; import { getScrapeQueue } from "../../../src/services/queue-service"; import { Logger } from "../../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; -import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs"; +import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; import { configDotenv } from "dotenv"; configDotenv(); -export async function getJobs(ids: string[]) { +export async function getJobs(crawlId: string, ids: string[]) { const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); if (process.env.USE_DB_AUTHENTICATION === "true") { - const supabaseData = await supabaseGetJobsById(ids); + const supabaseData = await supabaseGetJobsByCrawlId(crawlId); supabaseData.forEach(x => { const job = jobs.find(y => y.id === x.job_id); @@ -52,7 +52,7 @@ export async function crawlStatusController(req: Request, res: Response) { const jobIDs = await getCrawlJobs(req.params.jobId); - const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); const jobStatuses = await Promise.all(jobs.map(x => x.getState())); const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; diff --git a/apps/api/src/controllers/v0/status.ts b/apps/api/src/controllers/v0/status.ts index 34ebb3c6..bf8d2834 100644 --- a/apps/api/src/controllers/v0/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons // } // } - const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); const jobStatuses = await Promise.all(jobs.map(x => x.getState())); const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index cda6fd46..52e594c4 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -2,6 +2,11 @@ import { supabase_service } from "../services/supabase"; import { Logger } from "./logger"; import * as Sentry from "@sentry/node"; +/** + * Get a single firecrawl_job by ID + * @param jobId ID of Job + * @returns {any | null} Job + */ export const supabaseGetJobById = async (jobId: string) => { const { data, error } = await supabase_service .from("firecrawl_jobs") @@ -20,6 +25,11 @@ export const supabaseGetJobById = async (jobId: string) => { return data; }; +/** + * Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once. + * @param jobIds IDs of Jobs + * @returns {any[]} Jobs + */ export const supabaseGetJobsById = async (jobIds: string[]) => { const { data, error } = await supabase_service.rpc("get_jobs_by_ids", { job_ids: jobIds, @@ -38,6 +48,30 @@ export const supabaseGetJobsById = async (jobIds: string[]) => { return data; }; +/** + * Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once. + * @param crawlId ID of crawl + * @returns {any[]} Jobs + */ +export const supabaseGetJobsByCrawlId = async (crawlId: string) => { + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .select() + .eq("crawl_id", crawlId) + + if (error) { + Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); + Sentry.captureException(error); + return []; + } + + if (!data) { + return []; + } + + return data; +}; + export const supabaseGetJobByIdOnlyData = async (jobId: string) => { const { data, error } = await supabase_service From f8fbc71f91a842c86fba9b02ceca4bff4e74d7d6 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 09:20:18 +0200 Subject: [PATCH 19/47] fix(supabase-jobs): do not use RPCs RPCs are more failure-prone for this use case than regular queries are. --- apps/api/src/lib/supabase-jobs.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index 52e594c4..c418a6e0 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -31,12 +31,13 @@ export const supabaseGetJobById = async (jobId: string) => { * @returns {any[]} Jobs */ export const supabaseGetJobsById = async (jobIds: string[]) => { - const { data, error } = await supabase_service.rpc("get_jobs_by_ids", { - job_ids: jobIds, - }); + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .select() + .in("job_id", jobIds); if (error) { - Logger.error(`Error in get_jobs_by_ids: ${error}`); + Logger.error(`Error in supabaseGetJobsById: ${error}`); Sentry.captureException(error); return []; } From 26f2095de61103e854ef95326b6e0570b2494879 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 09:24:23 +0200 Subject: [PATCH 20/47] fix(v1): proper Invalid URL handling --- apps/api/src/controllers/v1/types.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 63ec1dd4..f812f981 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -30,7 +30,14 @@ export const url = z.preprocess( "URL must have a valid top-level domain or be a valid path" ) .refine( - (x) => checkUrl(x as string), + (x) => { + try { + checkUrl(x as string) + return true; + } catch (_) { + return false; + } + }, "Invalid URL" ) .refine( From b4dbf7553750a54040ff47fea9042d2858aaa9cd Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 10:25:14 +0200 Subject: [PATCH 21/47] fix(v1): check if url is string in blocklistMiddleware Fixes FIRECRAWL-SCRAPER-JS-9Z --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index daa9bf43..484ab5dc 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -83,7 +83,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) } function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { - if (req.body.url && isUrlBlocked(req.body.url)) { + if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { if (!res.headersSent) { return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); } From a17e1cac929ace616e371b4df4100a1029300609 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 10 Sep 2024 06:53:24 -0300 Subject: [PATCH 22/47] Rate bump --- apps/api/src/services/rate-limiter.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index dade8493..7cfff35b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -6,7 +6,7 @@ const RATE_LIMITS = { crawl: { default: 3, free: 2, - starter: 3, + starter: 10, standard: 5, standardOld: 40, scale: 50, @@ -19,9 +19,9 @@ const RATE_LIMITS = { scrape: { default: 20, free: 10, - starter: 20, + starter: 100, standard: 100, - standardOld: 40, + standardOld: 100, scale: 500, hobby: 20, standardNew: 100, @@ -32,8 +32,8 @@ const RATE_LIMITS = { search: { default: 20, free: 5, - starter: 20, - standard: 40, + starter: 50, + standard: 50, standardOld: 40, scale: 500, hobby: 10, @@ -45,9 +45,9 @@ const RATE_LIMITS = { map:{ default: 20, free: 5, - starter: 20, - standard: 40, - standardOld: 40, + starter: 50, + standard: 50, + standardOld: 50, scale: 500, hobby: 10, standardNew: 50, From 45237a29dde6f38af4a1a9b7c3d203fbb6c38795 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:09:39 -0300 Subject: [PATCH 23/47] updated js-sdk examples --- apps/js-sdk/example.js | 2 +- apps/js-sdk/example.ts | 2 +- apps/js-sdk/package-lock.json | 76 +++++++++++++++++++++++++++++++++-- apps/js-sdk/package.json | 1 + 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js index eb4bc489..c4b21d5f 100644 --- a/apps/js-sdk/example.js +++ b/apps/js-sdk/example.js @@ -1,4 +1,4 @@ -import FirecrawlApp from '@mendable/firecrawl-js'; +import FirecrawlApp from 'firecrawl'; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts index 4142416f..7412e479 100644 --- a/apps/js-sdk/example.ts +++ b/apps/js-sdk/example.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from '@mendable/firecrawl-js'; +import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from 'firecrawl'; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 95dd7d27..b0f358cb 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,8 +9,8 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.36", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", @@ -422,12 +422,14 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.36", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz", - "integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz", + "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==", "dependencies": { "axios": "^1.6.8", "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -594,6 +596,32 @@ "@esbuild/win32-x64": "0.20.2" } }, + "node_modules/firecrawl": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/firecrawl/-/firecrawl-1.2.0.tgz", + "integrity": "sha512-Sy1BCCvs5FhGc4yxPP7NG9iWnK8RXdvA1ZS/K1Gj+LrEN3iAT2WRzhYET7x8G2bif25F6rHJg57vdVb5sr6RyQ==", + "dependencies": { + "axios": "^1.6.8", + "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", + "uuid": "^9.0.1", + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.0" + } + }, + "node_modules/firecrawl/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/follow-redirects": { "version": "1.15.6", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", @@ -652,6 +680,20 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/isows": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz", + "integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/wagmi-dev" + } + ], + "peerDependencies": { + "ws": "*" + } + }, "node_modules/make-error": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", @@ -763,6 +805,11 @@ "node": ">=14.17" } }, + "node_modules/typescript-event-target": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz", + "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -786,6 +833,27 @@ "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==" }, + "node_modules/ws": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "peer": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/yn": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index b5d919f4..ac3ef038 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -13,6 +13,7 @@ "dependencies": { "@mendable/firecrawl-js": "^1.0.3", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", From ee8a54213c50ae88720ce5a03f76a65d270e81d0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:25:27 -0300 Subject: [PATCH 24/47] fix(py-sdk): removed asyncio package tested websocket with example.py without asyncio and it works with no problem. --- apps/python-sdk/firecrawl/firecrawl.py | 1 - apps/python-sdk/pyproject.toml | 3 +-- apps/python-sdk/requirements.txt | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 254f4c70..3961631e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -13,7 +13,6 @@ import logging import os import time from typing import Any, Dict, Optional, List -import asyncio import json import requests diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 969fb051..87cb91f1 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -12,8 +12,7 @@ dependencies = [ "requests", "python-dotenv", "websockets", - "asyncio", -"nest-asyncio" + "nest-asyncio" ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index 94971fde..db67ceeb 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -2,5 +2,4 @@ requests pytest python-dotenv websockets -asyncio nest-asyncio \ No newline at end of file From f855ad3436f97972383193980f1fb9f775636a0f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:29:44 -0300 Subject: [PATCH 25/47] bumping py-sdk version --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index f178cd61..540ce67e 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.2.3" +__version__ = "1.2.4" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 4ebc35c9dde46e1fd2e38364000aa493287b9650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 18:59:09 +0200 Subject: [PATCH 26/47] fix(crawl-status): add success: true --- apps/api/src/controllers/v1/crawl-status-ws.ts | 1 + apps/api/src/controllers/v1/crawl-status.ts | 1 + apps/api/src/controllers/v1/types.ts | 1 + 3 files changed, 3 insertions(+) diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 8d823096..16a67682 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -103,6 +103,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth Date: Tue, 10 Sep 2024 19:29:38 +0200 Subject: [PATCH 27/47] feat(js-sdk): paginate next on checkCrawlStatus + better types for CSR --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 62 ++++++++++++++++++------------ 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 7114a625..75ebe390 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.2", + "version": "1.2.3", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 8b16adfb..55c5be0b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -131,15 +131,14 @@ export interface CrawlResponse { */ export interface CrawlStatusResponse { success: true; - total: number; + status: "scraping" | "completed" | "failed" | "cancelled"; completed: number; + total: number; creditsUsed: number; expiresAt: Date; - status: "scraping" | "completed" | "failed"; - next: string; - data?: FirecrawlDocument[]; - error?: string; -} + next?: string; + data: FirecrawlDocument[]; +}; /** * Parameters for mapping operations. @@ -329,9 +328,10 @@ export default class FirecrawlApp { /** * Checks the status of a crawl job using the Firecrawl API. * @param id - The ID of the crawl operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. */ - async checkCrawlStatus(id?: string): Promise { + async checkCrawlStatus(id?: string, getAllData = false): Promise { if (!id) { throw new Error("No crawl ID provided"); } @@ -342,17 +342,29 @@ export default class FirecrawlApp { `${this.apiUrl}/v1/crawl/${id}`, headers ); - if (response.status === 200) { + if (response.status === 200 && getAllData) { + let allData = response.data.data; + if (response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } return ({ - success: true, + success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), next: response.data.next, - data: response.data.data, - error: response.data.error + data: allData, + error: response.data.error, }) } else { this.handleError(response, "check crawl status"); @@ -452,7 +464,7 @@ export default class FirecrawlApp { id: string, headers: AxiosRequestHeaders, checkInterval: number - ): Promise { + ): Promise { while (true) { let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, @@ -460,20 +472,20 @@ export default class FirecrawlApp { ); if (statusResponse.status === 200) { let statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - let data = statusData.data; - while ('next' in statusData) { - statusResponse = await this.getRequest(statusData.next, headers); - statusData = statusResponse.data; - data = data.concat(statusData.data); + if (statusData.status === "completed") { + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; + return statusData; + } else { + throw new Error("Crawl job completed but no data was returned"); } - statusData.data = data; - return statusData; - } else { - throw new Error("Crawl job completed but no data was returned"); - } - } else if ( + } else if ( ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { checkInterval = Math.max(checkInterval, 2); From ad1a6fbc74eeb51c8ac2be870c4535382c8e0428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 19:41:01 +0200 Subject: [PATCH 28/47] fix(v1/map): handle invalid URLs gracefully --- apps/api/src/controllers/v1/map.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index e6abd9ae..a9c61d04 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -88,7 +88,13 @@ export async function mapController( links = performCosineSimilarity(links, searchQuery); } - links = links.map((x) => checkAndUpdateURLForMap(x).url.trim()); + links = links.map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim() + } catch (_) { + return null; + } + }).filter(x => x !== null); // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); From 83a165db0fd0e680f4dfb1c41cbcb20901d5e8f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 21:18:53 +0200 Subject: [PATCH 29/47] fix(v0/scrape): ensure url is string --- apps/api/src/controllers/v0/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index bc91da18..2a5f1d4f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -39,7 +39,7 @@ export async function scrapeHelper( returnCode: number; }> { const url = req.body.url; - if (!url) { + if (typeof url !== "string") { return { success: false, error: "Url is required", returnCode: 400 }; } From 97ffabff3a6b6bd1c7455b85ce794949f540469b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 21:21:20 +0200 Subject: [PATCH 30/47] fix(v1): converting bad docs always gives null --- apps/api/src/controllers/v1/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 6b2db308..c44c1cc5 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -348,7 +348,7 @@ export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions { } export function legacyDocumentConverter(doc: any): Document { - if (doc === null || doc === undefined) return doc; + if (doc === null || doc === undefined) return null; if (doc.metadata) { if (doc.metadata.screenshot) { From f6fc71b46a54f16b34cdada9be0c6fb53810582d Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Wed, 11 Sep 2024 17:53:17 +0300 Subject: [PATCH 31/47] fix(js-sdk): bring back cjs exports --- apps/js-sdk/firecrawl/package-lock.json | 1549 ++++++++++++++++++++++- apps/js-sdk/firecrawl/package.json | 10 +- apps/js-sdk/firecrawl/tsconfig.json | 12 +- apps/js-sdk/firecrawl/tsup.config.ts | 9 + 4 files changed, 1560 insertions(+), 20 deletions(-) create mode 100644 apps/js-sdk/firecrawl/tsup.config.ts diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index ce6a1a4a..ee1baba3 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "license": "MIT", "dependencies": { "axios": "^1.6.8", @@ -27,6 +27,7 @@ "@types/uuid": "^9.0.8", "jest": "^29.7.0", "ts-jest": "^29.2.2", + "tsup": "^8.2.4", "typescript": "^5.4.5" } }, @@ -600,6 +601,486 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.23.1.tgz", + "integrity": "sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.23.1.tgz", + "integrity": "sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.23.1.tgz", + "integrity": "sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.23.1.tgz", + "integrity": "sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.23.1.tgz", + "integrity": "sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.23.1.tgz", + "integrity": "sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.1.tgz", + "integrity": "sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.23.1.tgz", + "integrity": "sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.23.1.tgz", + "integrity": "sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.23.1.tgz", + "integrity": "sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.23.1.tgz", + "integrity": "sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.23.1.tgz", + "integrity": "sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.23.1.tgz", + "integrity": "sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==", + "cpu": [ + "mips64el" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.23.1.tgz", + "integrity": "sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.23.1.tgz", + "integrity": "sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.23.1.tgz", + "integrity": "sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.23.1.tgz", + "integrity": "sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.23.1.tgz", + "integrity": "sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.1.tgz", + "integrity": "sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.23.1.tgz", + "integrity": "sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.23.1.tgz", + "integrity": "sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.23.1.tgz", + "integrity": "sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.23.1.tgz", + "integrity": "sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.23.1.tgz", + "integrity": "sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@isaacs/cliui": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", + "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", + "dev": true, + "dependencies": { + "string-width": "^5.1.2", + "string-width-cjs": "npm:string-width@^4.2.0", + "strip-ansi": "^7.0.1", + "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", + "wrap-ansi": "^8.1.0", + "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@isaacs/cliui/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/emoji-regex": { + "version": "9.2.2", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", + "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", + "dev": true + }, + "node_modules/@isaacs/cliui/node_modules/string-width": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", + "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", + "dev": true, + "dependencies": { + "eastasianwidth": "^0.2.0", + "emoji-regex": "^9.2.2", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@isaacs/cliui/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "dev": true, + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/wrap-ansi": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", + "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", + "dev": true, + "dependencies": { + "ansi-styles": "^6.1.0", + "string-width": "^5.0.1", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -951,6 +1432,259 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@pkgjs/parseargs": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", + "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", + "dev": true, + "optional": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.21.2.tgz", + "integrity": "sha512-fSuPrt0ZO8uXeS+xP3b+yYTCBUd05MoSp2N/MFOgjhhUhMmchXlpTQrTpI8T+YAwAQuK7MafsCOxW7VrPMrJcg==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.21.2.tgz", + "integrity": "sha512-xGU5ZQmPlsjQS6tzTTGwMsnKUtu0WVbl0hYpTPauvbRAnmIvpInhJtgjj3mcuJpEiuUw4v1s4BimkdfDWlh7gA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.21.2.tgz", + "integrity": "sha512-99AhQ3/ZMxU7jw34Sq8brzXqWH/bMnf7ZVhvLk9QU2cOepbQSVTns6qoErJmSiAvU3InRqC2RRZ5ovh1KN0d0Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.21.2.tgz", + "integrity": "sha512-ZbRaUvw2iN/y37x6dY50D8m2BnDbBjlnMPotDi/qITMJ4sIxNY33HArjikDyakhSv0+ybdUxhWxE6kTI4oX26w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.21.2.tgz", + "integrity": "sha512-ztRJJMiE8nnU1YFcdbd9BcH6bGWG1z+jP+IPW2oDUAPxPjo9dverIOyXz76m6IPA6udEL12reYeLojzW2cYL7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.21.2.tgz", + "integrity": "sha512-flOcGHDZajGKYpLV0JNc0VFH361M7rnV1ee+NTeC/BQQ1/0pllYcFmxpagltANYt8FYf9+kL6RSk80Ziwyhr7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.21.2.tgz", + "integrity": "sha512-69CF19Kp3TdMopyteO/LJbWufOzqqXzkrv4L2sP8kfMaAQ6iwky7NoXTp7bD6/irKgknDKM0P9E/1l5XxVQAhw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.21.2.tgz", + "integrity": "sha512-48pD/fJkTiHAZTnZwR0VzHrao70/4MlzJrq0ZsILjLW/Ab/1XlVUStYyGt7tdyIiVSlGZbnliqmult/QGA2O2w==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.21.2.tgz", + "integrity": "sha512-cZdyuInj0ofc7mAQpKcPR2a2iu4YM4FQfuUzCVA2u4HI95lCwzjoPtdWjdpDKyHxI0UO82bLDoOaLfpZ/wviyQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.21.2.tgz", + "integrity": "sha512-RL56JMT6NwQ0lXIQmMIWr1SW28z4E4pOhRRNqwWZeXpRlykRIlEpSWdsgNWJbYBEWD84eocjSGDu/XxbYeCmwg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.21.2.tgz", + "integrity": "sha512-PMxkrWS9z38bCr3rWvDFVGD6sFeZJw4iQlhrup7ReGmfn7Oukrr/zweLhYX6v2/8J6Cep9IEA/SmjXjCmSbrMQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.21.2.tgz", + "integrity": "sha512-B90tYAUoLhU22olrafY3JQCFLnT3NglazdwkHyxNDYF/zAxJt5fJUB/yBoWFoIQ7SQj+KLe3iL4BhOMa9fzgpw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.21.2.tgz", + "integrity": "sha512-7twFizNXudESmC9oneLGIUmoHiiLppz/Xs5uJQ4ShvE6234K0VB1/aJYU3f/4g7PhssLGKBVCC37uRkkOi8wjg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.21.2.tgz", + "integrity": "sha512-9rRero0E7qTeYf6+rFh3AErTNU1VCQg2mn7CQcI44vNUWM9Ze7MSRS/9RFuSsox+vstRt97+x3sOhEey024FRQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.21.2.tgz", + "integrity": "sha512-5rA4vjlqgrpbFVVHX3qkrCo/fZTj1q0Xxpg+Z7yIo3J2AilW7t2+n6Q8Jrx+4MrYpAnjttTYF8rr7bP46BPzRw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.21.2.tgz", + "integrity": "sha512-6UUxd0+SKomjdzuAcp+HAmxw1FlGBnl1v2yEPSabtx4lBfdXHDVsW7+lQkgz9cNFJGY3AWR7+V8P5BqkD9L9nA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -1036,6 +1770,12 @@ "dotenv": "*" } }, + "node_modules/@types/estree": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz", + "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==", + "dev": true + }, "node_modules/@types/graceful-fs": { "version": "4.1.9", "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz", @@ -1160,6 +1900,12 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, + "node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "dev": true + }, "node_modules/anymatch": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", @@ -1182,6 +1928,15 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/async": { "version": "3.2.5", "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", @@ -1316,6 +2071,18 @@ "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", "dev": true }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -1397,6 +2164,30 @@ "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", "dev": true }, + "node_modules/bundle-require": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/bundle-require/-/bundle-require-5.0.0.tgz", + "integrity": "sha512-GuziW3fSSmopcx4KRymQEJVbZUfqlCqcq7dvs6TYwKRZiegK/2buMxQTPs6MGlNv50wms1699qYO54R8XfRX4w==", + "dev": true, + "dependencies": { + "load-tsconfig": "^0.2.3" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "peerDependencies": { + "esbuild": ">=0.18" + } + }, + "node_modules/cac": { + "version": "6.7.14", + "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", + "integrity": "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -1460,6 +2251,30 @@ "node": ">=10" } }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, "node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -1540,12 +2355,30 @@ "node": ">= 0.8" } }, + "node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", "dev": true }, + "node_modules/consola": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/consola/-/consola-3.2.3.tgz", + "integrity": "sha512-I5qxpzLv+sJhTVEoLYNcTW+bThDCPsit0vLNKShZx6rLtpilNpmmeTPaeqJb9ZE9dV3DGaeby6Vuhrw38WjeyQ==", + "dev": true, + "engines": { + "node": "^14.18.0 || >=16.10.0" + } + }, "node_modules/convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -1588,12 +2421,12 @@ } }, "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz", + "integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==", "dev": true, "dependencies": { - "ms": "2.1.2" + "ms": "^2.1.3" }, "engines": { "node": ">=6.0" @@ -1653,6 +2486,18 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "dev": true, + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/dotenv": { "version": "16.4.5", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", @@ -1664,6 +2509,12 @@ "url": "https://dotenvx.com" } }, + "node_modules/eastasianwidth": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", + "dev": true + }, "node_modules/ejs": { "version": "3.1.10", "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.1.10.tgz", @@ -1712,6 +2563,45 @@ "is-arrayish": "^0.2.1" } }, + "node_modules/esbuild": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.23.1.tgz", + "integrity": "sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==", + "dev": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.23.1", + "@esbuild/android-arm": "0.23.1", + "@esbuild/android-arm64": "0.23.1", + "@esbuild/android-x64": "0.23.1", + "@esbuild/darwin-arm64": "0.23.1", + "@esbuild/darwin-x64": "0.23.1", + "@esbuild/freebsd-arm64": "0.23.1", + "@esbuild/freebsd-x64": "0.23.1", + "@esbuild/linux-arm": "0.23.1", + "@esbuild/linux-arm64": "0.23.1", + "@esbuild/linux-ia32": "0.23.1", + "@esbuild/linux-loong64": "0.23.1", + "@esbuild/linux-mips64el": "0.23.1", + "@esbuild/linux-ppc64": "0.23.1", + "@esbuild/linux-riscv64": "0.23.1", + "@esbuild/linux-s390x": "0.23.1", + "@esbuild/linux-x64": "0.23.1", + "@esbuild/netbsd-x64": "0.23.1", + "@esbuild/openbsd-arm64": "0.23.1", + "@esbuild/openbsd-x64": "0.23.1", + "@esbuild/sunos-x64": "0.23.1", + "@esbuild/win32-arm64": "0.23.1", + "@esbuild/win32-ia32": "0.23.1", + "@esbuild/win32-x64": "0.23.1" + } + }, "node_modules/escalade": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz", @@ -1791,12 +2681,37 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, "node_modules/fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", "dev": true }, + "node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "dev": true, + "dependencies": { + "reusify": "^1.0.4" + } + }, "node_modules/fb-watchman": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", @@ -1880,6 +2795,34 @@ } } }, + "node_modules/foreground-child": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", + "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", + "dev": true, + "dependencies": { + "cross-spawn": "^7.0.0", + "signal-exit": "^4.0.1" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/foreground-child/node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/form-data": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", @@ -1981,6 +2924,18 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/globals": { "version": "11.12.0", "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz", @@ -1990,6 +2945,26 @@ "node": ">=4" } }, + "node_modules/globby": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", + "dev": true, + "dependencies": { + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.2.9", + "ignore": "^5.2.0", + "merge2": "^1.4.1", + "slash": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -2032,6 +3007,15 @@ "node": ">=10.17.0" } }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, "node_modules/import-local": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.1.0.tgz", @@ -2082,6 +3066,18 @@ "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", "dev": true }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/is-core-module": { "version": "2.13.1", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.1.tgz", @@ -2094,6 +3090,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-fullwidth-code-point": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", @@ -2112,6 +3117,18 @@ "node": ">=6" } }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-number": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", @@ -2252,6 +3269,21 @@ "node": ">=8" } }, + "node_modules/jackspeak": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", + "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", + "dev": true, + "dependencies": { + "@isaacs/cliui": "^8.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + }, + "optionalDependencies": { + "@pkgjs/parseargs": "^0.11.0" + } + }, "node_modules/jake": { "version": "10.9.1", "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.1.tgz", @@ -2858,6 +3890,15 @@ "url": "https://github.com/chalk/supports-color?sponsor=1" } }, + "node_modules/joycon": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/joycon/-/joycon-3.1.1.tgz", + "integrity": "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==", + "dev": true, + "engines": { + "node": ">=10" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -2925,12 +3966,33 @@ "node": ">=6" } }, + "node_modules/lilconfig": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", + "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", "dev": true }, + "node_modules/load-tsconfig": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/load-tsconfig/-/load-tsconfig-0.2.5.tgz", + "integrity": "sha512-IXO6OCs9yg8tMKzfPZ1YmheJbZCiEsnBdcB03l0OcfK9prKnJb96siuHCr5Fl37/yo9DnKU+TLpxzTUspw9shg==", + "dev": true, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + } + }, "node_modules/locate-path": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", @@ -2949,6 +4011,12 @@ "integrity": "sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==", "dev": true }, + "node_modules/lodash.sortby": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", + "integrity": "sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==", + "dev": true + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -3027,6 +4095,15 @@ "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", "dev": true }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, "node_modules/micromatch": { "version": "4.0.5", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz", @@ -3080,12 +4157,32 @@ "node": "*" } }, + "node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "dev": true, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "dev": true }, + "node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "dev": true, + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -3125,6 +4222,15 @@ "node": ">=8" } }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -3200,6 +4306,12 @@ "node": ">=6" } }, + "node_modules/package-json-from-dist": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.0.tgz", + "integrity": "sha512-dATvCeZN/8wQsGywez1mzHtTlP22H8OEfPrVMLNr4/eGa+ijtLn/6M5f0dY8UKNrC2O9UCU6SSoG3qRKnt7STw==", + "dev": true + }, "node_modules/parse-json": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", @@ -3251,6 +4363,37 @@ "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", "dev": true }, + "node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "dev": true, + "dependencies": { + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/path-scurry/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "dev": true + }, + "node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/picocolors": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.1.tgz", @@ -3290,6 +4433,48 @@ "node": ">=8" } }, + "node_modules/postcss-load-config": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-6.0.1.tgz", + "integrity": "sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "lilconfig": "^3.1.1" + }, + "engines": { + "node": ">= 18" + }, + "peerDependencies": { + "jiti": ">=1.21.0", + "postcss": ">=8.0.9", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + }, + "postcss": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, "node_modules/pretty-format": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", @@ -3334,6 +4519,15 @@ "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ -3350,12 +4544,44 @@ } ] }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, "node_modules/react-is": { "version": "18.2.0", "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==", "dev": true }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -3412,6 +4638,74 @@ "node": ">=10" } }, + "node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "dev": true, + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.21.2.tgz", + "integrity": "sha512-e3TapAgYf9xjdLvKQCkQTnbTKd4a6jwlpQSJJFokHGaX2IVjoEqkIIhiQfqsi0cdwlOD+tQGuOd5AJkc5RngBw==", + "dev": true, + "dependencies": { + "@types/estree": "1.0.5" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.21.2", + "@rollup/rollup-android-arm64": "4.21.2", + "@rollup/rollup-darwin-arm64": "4.21.2", + "@rollup/rollup-darwin-x64": "4.21.2", + "@rollup/rollup-linux-arm-gnueabihf": "4.21.2", + "@rollup/rollup-linux-arm-musleabihf": "4.21.2", + "@rollup/rollup-linux-arm64-gnu": "4.21.2", + "@rollup/rollup-linux-arm64-musl": "4.21.2", + "@rollup/rollup-linux-powerpc64le-gnu": "4.21.2", + "@rollup/rollup-linux-riscv64-gnu": "4.21.2", + "@rollup/rollup-linux-s390x-gnu": "4.21.2", + "@rollup/rollup-linux-x64-gnu": "4.21.2", + "@rollup/rollup-linux-x64-musl": "4.21.2", + "@rollup/rollup-win32-arm64-msvc": "4.21.2", + "@rollup/rollup-win32-ia32-msvc": "4.21.2", + "@rollup/rollup-win32-x64-msvc": "4.21.2", + "fsevents": "~2.3.2" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, "node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -3527,6 +4821,21 @@ "node": ">=8" } }, + "node_modules/string-width-cjs": { + "name": "string-width", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/strip-ansi": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", @@ -3539,6 +4848,19 @@ "node": ">=8" } }, + "node_modules/strip-ansi-cjs": { + "name": "strip-ansi", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/strip-bom": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz", @@ -3569,6 +4891,72 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/sucrase": { + "version": "3.35.0", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", + "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "glob": "^10.3.10", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/glob": { + "version": "10.4.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", + "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "dev": true, + "dependencies": { + "foreground-child": "^3.1.0", + "jackspeak": "^3.1.2", + "minimatch": "^9.0.4", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^1.11.1" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/minimatch": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", + "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "dev": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/supports-color": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", @@ -3607,6 +4995,27 @@ "node": ">=8" } }, + "node_modules/thenify": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "dev": true, + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "dev": true, + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/tmpl": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", @@ -3634,6 +5043,30 @@ "node": ">=8.0" } }, + "node_modules/tr46": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", + "integrity": "sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==", + "dev": true, + "dependencies": { + "punycode": "^2.1.0" + } + }, + "node_modules/tree-kill": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", + "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", + "dev": true, + "bin": { + "tree-kill": "cli.js" + } + }, + "node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "dev": true + }, "node_modules/ts-jest": { "version": "29.2.2", "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-29.2.2.tgz", @@ -3715,6 +5148,69 @@ "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", "dev": true }, + "node_modules/tsup": { + "version": "8.2.4", + "resolved": "https://registry.npmjs.org/tsup/-/tsup-8.2.4.tgz", + "integrity": "sha512-akpCPePnBnC/CXgRrcy72ZSntgIEUa1jN0oJbbvpALWKNOz1B7aM+UVDWGRGIO/T/PZugAESWDJUAb5FD48o8Q==", + "dev": true, + "dependencies": { + "bundle-require": "^5.0.0", + "cac": "^6.7.14", + "chokidar": "^3.6.0", + "consola": "^3.2.3", + "debug": "^4.3.5", + "esbuild": "^0.23.0", + "execa": "^5.1.1", + "globby": "^11.1.0", + "joycon": "^3.1.1", + "picocolors": "^1.0.1", + "postcss-load-config": "^6.0.1", + "resolve-from": "^5.0.0", + "rollup": "^4.19.0", + "source-map": "0.8.0-beta.0", + "sucrase": "^3.35.0", + "tree-kill": "^1.2.2" + }, + "bin": { + "tsup": "dist/cli-default.js", + "tsup-node": "dist/cli-node.js" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@microsoft/api-extractor": "^7.36.0", + "@swc/core": "^1", + "postcss": "^8.4.12", + "typescript": ">=4.5.0" + }, + "peerDependenciesMeta": { + "@microsoft/api-extractor": { + "optional": true + }, + "@swc/core": { + "optional": true + }, + "postcss": { + "optional": true + }, + "typescript": { + "optional": true + } + } + }, + "node_modules/tsup/node_modules/source-map": { + "version": "0.8.0-beta.0", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.8.0-beta.0.tgz", + "integrity": "sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA==", + "dev": true, + "dependencies": { + "whatwg-url": "^7.0.0" + }, + "engines": { + "node": ">= 8" + } + }, "node_modules/type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", @@ -3825,6 +5321,23 @@ "makeerror": "1.0.12" } }, + "node_modules/webidl-conversions": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", + "integrity": "sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==", + "dev": true + }, + "node_modules/whatwg-url": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-7.1.0.tgz", + "integrity": "sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==", + "dev": true, + "dependencies": { + "lodash.sortby": "^4.7.0", + "tr46": "^1.0.1", + "webidl-conversions": "^4.0.2" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -3857,6 +5370,24 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/wrap-ansi-cjs": { + "name": "wrap-ansi", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 430cffff..f717365e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -4,9 +4,16 @@ "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", + "exports": { + "./package.json": "./package.json", + ".": { + "import": "./dist/index.js", + "default": "./dist/index.cjs" + } + }, "type": "module", "scripts": { - "build": "tsc", + "build": "tsup", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts" @@ -40,6 +47,7 @@ "@types/uuid": "^9.0.8", "jest": "^29.7.0", "ts-jest": "^29.2.2", + "tsup": "^8.2.4", "typescript": "^5.4.5" }, "keywords": [ diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index 071b13ce..1297aed9 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -16,17 +16,9 @@ "noUncheckedIndexedAccess": true, "noImplicitOverride": true, - /* If transpiling with TypeScript: */ + /* If NOT transpiling with TypeScript: */ "module": "NodeNext", - "outDir": "dist", - "rootDir": "src", - "sourceMap": true, - - /* AND if you're building for a library: */ - "declaration": true, - - /* AND if you're building for a library in a monorepo: */ - "declarationMap": true /* Skip type checking all .d.ts files. */ + "noEmit": true, }, "include": ["src/**/*"], "exclude": ["node_modules", "dist", "**/__tests__/*"] diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts new file mode 100644 index 00000000..b3b7e42d --- /dev/null +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -0,0 +1,9 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + entryPoints: ["src/index.ts"], + format: ["cjs", "esm"], + dts: true, + outDir: "dist", + clean: true, +}); \ No newline at end of file From 4cd1065ae22ff24df26a495f277193330beb30a6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 11 Sep 2024 14:03:34 -0400 Subject: [PATCH 32/47] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 7cfff35b..1a40671a 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -112,14 +112,16 @@ export const scrapeStatusRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"]; + export function getRateLimiter( mode: RateLimiterMode, token: string, plan?: string, teamId?: string ) { - - if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673") || token.includes("23befa1b")) { + + if (testSuiteTokens.some(testToken => token.includes(testToken))) { return testSuiteRateLimiter; } From 6e1cf2f40d6877de1d0f1cb666cfaa9403d14338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 11 Sep 2024 20:15:34 +0200 Subject: [PATCH 33/47] feat(js-sdk): fixes, update tests --- apps/js-sdk/firecrawl/package-lock.json | 4 +- .../__tests__/v1/e2e_withAuth/index.test.ts | 199 ++++++++++-------- apps/js-sdk/firecrawl/src/index.ts | 6 +- 3 files changed, 118 insertions(+), 91 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 83745a5b..641f55fc 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.1", + "version": "1.2.3", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.2.1", + "version": "1.2.3", "license": "MIT", "dependencies": { "axios": "^1.6.8", diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 9f6c6462..5eadd92e 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index'; +import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals'; dotenv.config(); const TEST_API_KEY = process.env.TEST_API_KEY; -const API_URL = "http://127.0.0.1:3002"; +const API_URL = "https://api.firecrawl.dev"; describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for no API key', async () => { @@ -71,6 +71,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.links?.length).toBeGreaterThan(0); expect(response.links?.[0]).toContain("https://"); expect(response.metadata).not.toBeNull(); + expect(response.metadata).not.toBeUndefined(); expect(response.metadata).toHaveProperty("title"); expect(response.metadata).toHaveProperty("description"); expect(response.metadata).toHaveProperty("keywords"); @@ -85,19 +86,21 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.metadata).not.toHaveProperty("pageStatusCode"); expect(response.metadata).toHaveProperty("statusCode"); expect(response.metadata).not.toHaveProperty("pageError"); - expect(response.metadata.error).toBeUndefined(); - expect(response.metadata.title).toBe("Roast My Website"); - expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); - expect(response.metadata.robots).toBe("follow, index"); - expect(response.metadata.ogTitle).toBe("Roast My Website"); - expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); - expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); - expect(response.metadata.ogLocaleAlternate).toStrictEqual([]); - expect(response.metadata.ogSiteName).toBe("Roast My Website"); - expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.metadata.statusCode).toBe(200); + if (response.metadata !== undefined) { + expect(response.metadata.error).toBeUndefined(); + expect(response.metadata.title).toBe("Roast My Website"); + expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); + expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); + expect(response.metadata.robots).toBe("follow, index"); + expect(response.metadata.ogTitle).toBe("Roast My Website"); + expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); + expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); + expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); + expect(response.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai"); + expect(response.metadata.statusCode).toBe(200); + } }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file', async () => { @@ -127,7 +130,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should return successful response for crawl and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse; + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response).toHaveProperty("total"); expect(response.total).toBeGreaterThan(0); @@ -138,21 +141,25 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); expect(response).not.toHaveProperty("next"); // wait until done - expect(response.data?.length).toBeGreaterThan(0); - expect(response.data?.[0]).toHaveProperty("markdown"); - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0]).not.toHaveProperty("html"); - expect(response.data?.[0]).not.toHaveProperty("rawHtml"); - expect(response.data?.[0]).not.toHaveProperty("screenshot"); - expect(response.data?.[0]).not.toHaveProperty("links"); - expect(response.data?.[0]).toHaveProperty("metadata"); - expect(response.data?.[0].metadata).toHaveProperty("title"); - expect(response.data?.[0].metadata).toHaveProperty("description"); - expect(response.data?.[0].metadata).toHaveProperty("language"); - expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); - expect(response.data?.[0].metadata).toHaveProperty("statusCode"); - expect(response.data?.[0].metadata).not.toHaveProperty("error"); + expect(response.data.length).toBeGreaterThan(0); + expect(response.data[0]).not.toBeNull(); + expect(response.data[0]).not.toBeUndefined(); + if (response.data[0]) { + expect(response.data[0]).toHaveProperty("markdown"); + expect(response.data[0].markdown).toContain("_Roast_"); + expect(response.data[0]).not.toHaveProperty('content'); // v0 + expect(response.data[0]).not.toHaveProperty("html"); + expect(response.data[0]).not.toHaveProperty("rawHtml"); + expect(response.data[0]).not.toHaveProperty("screenshot"); + expect(response.data[0]).not.toHaveProperty("links"); + expect(response.data[0]).toHaveProperty("metadata"); + expect(response.data[0].metadata).toHaveProperty("title"); + expect(response.data[0].metadata).toHaveProperty("description"); + expect(response.data[0].metadata).toHaveProperty("language"); + expect(response.data[0].metadata).toHaveProperty("sourceURL"); + expect(response.data[0].metadata).toHaveProperty("statusCode"); + expect(response.data[0].metadata).not.toHaveProperty("error"); + } }, 60000); // 60 seconds timeout test.concurrent('should return successful response for crawl with options and wait for completion', async () => { @@ -173,7 +180,7 @@ describe('FirecrawlApp E2E Tests', () => { onlyMainContent: true, waitFor: 1000 } - } as CrawlParams, true, 30) as CrawlStatusResponse; + } as CrawlParams, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response).toHaveProperty("total"); expect(response.total).toBeGreaterThan(0); @@ -184,41 +191,45 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); expect(response).not.toHaveProperty("next"); - expect(response.data?.length).toBeGreaterThan(0); - expect(response.data?.[0]).toHaveProperty("markdown"); - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0]).toHaveProperty("html"); - expect(response.data?.[0].html).toContain(" { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const uniqueIdempotencyKey = uuidv4(); - const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse; + const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse; expect(response).not.toBeNull(); expect(response.id).toBeDefined(); - await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); + await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); }); test.concurrent('should check crawl status', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse; + const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse; expect(response).not.toBeNull(); expect(response.id).toBeDefined(); @@ -226,7 +237,8 @@ describe('FirecrawlApp E2E Tests', () => { const maxChecks = 15; let checks = 0; - while (statusResponse.status === 'scraping' && checks < maxChecks) { + expect(statusResponse.success).toBe(true); + while ((statusResponse as any).status === 'scraping' && checks < maxChecks) { await new Promise(resolve => setTimeout(resolve, 5000)); expect(statusResponse).not.toHaveProperty("partial_data"); // v0 expect(statusResponse).not.toHaveProperty("current"); // v0 @@ -236,44 +248,55 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse).toHaveProperty("expiresAt"); expect(statusResponse).toHaveProperty("status"); expect(statusResponse).toHaveProperty("next"); - expect(statusResponse.total).toBeGreaterThan(0); - expect(statusResponse.creditsUsed).toBeGreaterThan(0); - expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); - expect(statusResponse.status).toBe("scraping"); - expect(statusResponse.next).toContain("/v1/crawl/"); + expect(statusResponse.success).toBe(true); + if (statusResponse.success === true) { + expect(statusResponse.total).toBeGreaterThan(0); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse.status).toBe("scraping"); + expect(statusResponse.next).toContain("/v1/crawl/"); + } statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; + expect(statusResponse.success).toBe(true); checks++; } expect(statusResponse).not.toBeNull(); expect(statusResponse).toHaveProperty("total"); - expect(statusResponse.total).toBeGreaterThan(0); - expect(statusResponse).toHaveProperty("creditsUsed"); - expect(statusResponse.creditsUsed).toBeGreaterThan(0); - expect(statusResponse).toHaveProperty("expiresAt"); - expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); - expect(statusResponse).toHaveProperty("status"); - expect(statusResponse.status).toBe("completed"); - expect(statusResponse.data?.length).toBeGreaterThan(0); - expect(statusResponse.data?.[0]).toHaveProperty("markdown"); - expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10); - expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0 - expect(statusResponse.data?.[0]).toHaveProperty("html"); - expect(statusResponse.data?.[0].html).toContain(" { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6c61c628..6248789b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -183,7 +183,11 @@ export default class FirecrawlApp { * @param config - Configuration options for the FirecrawlApp instance. */ constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - this.apiKey = apiKey || ""; + if (typeof apiKey !== "string") { + throw new Error("No API key provided"); + } + + this.apiKey = apiKey; this.apiUrl = apiUrl || "https://api.firecrawl.dev"; } From ad70c30be537fa4aa8283ae98331eb828c0a276b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 11 Sep 2024 20:31:58 +0200 Subject: [PATCH 34/47] fix(js-sdk): check at bad if --- apps/js-sdk/firecrawl/src/index.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6248789b..115e62e9 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -186,7 +186,7 @@ export default class FirecrawlApp { if (typeof apiKey !== "string") { throw new Error("No API key provided"); } - + this.apiKey = apiKey; this.apiUrl = apiUrl || "https://api.firecrawl.dev"; } @@ -346,9 +346,9 @@ export default class FirecrawlApp { `${this.apiUrl}/v1/crawl/${id}`, headers ); - if (response.status === 200 && getAllData) { + if (response.status === 200) { let allData = response.data.data; - if (response.data.status === "completed") { + if (getAllData && response.data.status === "completed") { let statusData = response.data if ("data" in statusData) { let data = statusData.data; From 5adfd74cc5a6e251d0ad6fc4bc4be7cc7cd6d04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 11 Sep 2024 20:32:34 +0200 Subject: [PATCH 35/47] feat(js-sdk/test): add API_URL env var --- .../firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 5eadd92e..98a52538 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals'; dotenv.config(); const TEST_API_KEY = process.env.TEST_API_KEY; -const API_URL = "https://api.firecrawl.dev"; +const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for no API key', async () => { From 99c1af0a9f6d2cf00e3ffce33ad7984aca345548 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 11 Sep 2024 14:59:36 -0400 Subject: [PATCH 36/47] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f5de3159..c3135aca 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.3", + "version": "1.3.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 503c8b3efa46697258e2decfd0892bf103679afd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 12 Sep 2024 11:35:26 -0400 Subject: [PATCH 37/47] Update package-lock.json --- apps/js-sdk/firecrawl/package-lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 641f55fc..2dcca44d 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.3", + "version": "1.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.2.3", + "version": "1.3.0", "license": "MIT", "dependencies": { "axios": "^1.6.8", From eec22a56d3dce2253da390afd17753850e70ae31 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 12 Sep 2024 11:43:40 -0400 Subject: [PATCH 38/47] Nick: self host issue template --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .github/ISSUE_TEMPLATE/self_host_issue.md | 40 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/self_host_issue.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index bb47b47f..bbc1e098 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: Bug report about: Create a report to help us improve -title: "[BUG]" +title: "[Bug] " labels: bug assignees: '' diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index b01699b7..6760afa8 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,7 +1,7 @@ --- name: Feature request about: Suggest an idea for this project -title: "[Feat]" +title: "[Feat] " labels: '' assignees: '' diff --git a/.github/ISSUE_TEMPLATE/self_host_issue.md b/.github/ISSUE_TEMPLATE/self_host_issue.md new file mode 100644 index 00000000..73a0ef9d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/self_host_issue.md @@ -0,0 +1,40 @@ +--- +name: Self-host issue +about: Report an issue with self-hosting Firecrawl +title: "[Self-Host] " +labels: self-host +assignees: '' + +--- + +**Describe the Issue** +Provide a clear and concise description of the self-hosting issue you're experiencing. + +**To Reproduce** +Steps to reproduce the issue: +1. Configure the environment or settings with '...' +2. Run the command '...' +3. Observe the error or unexpected output at '...' +4. Log output/error message + +**Expected Behavior** +A clear and concise description of what you expected to happen when self-hosting. + +**Screenshots** +If applicable, add screenshots or copies of the command line output to help explain the self-hosting issue. + +**Environment (please complete the following information):** +- OS: [e.g. macOS, Linux, Windows] +- Firecrawl Version: [e.g. 1.2.3] +- Node.js Version: [e.g. 14.x] +- Docker Version (if applicable): [e.g. 20.10.14] +- Database Type and Version: [e.g. PostgreSQL 13.4] + +**Logs** +If applicable, include detailed logs to help understand the self-hosting problem. + +**Configuration** +Provide relevant parts of your configuration files (with sensitive information redacted). + +**Additional Context** +Add any other context about the self-hosting issue here, such as specific infrastructure details, network setup, or any modifications made to the original Firecrawl setup. From a2903e75cfb06fa7cbbb31e663908c70c42a544d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 12 Sep 2024 18:48:19 +0200 Subject: [PATCH 39/47] feat(js-sdk): type-safe LLM extract --- apps/js-sdk/firecrawl/src/index.ts | 43 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 115e62e9..124a84e8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,5 +1,5 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios"; -import type { ZodSchema } from "zod"; +import type { infer as ZodInfer, ZodSchema } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; @@ -58,13 +58,13 @@ export interface FirecrawlDocumentMetadata { * Document interface for Firecrawl. * Represents a document retrieved or processed by Firecrawl. */ -export interface FirecrawlDocument { +export interface FirecrawlDocument { url?: string; markdown?: string; html?: string; rawHtml?: string; links?: string[]; - extract?: Record; + extract?: T; screenshot?: string; metadata?: FirecrawlDocumentMetadata; } @@ -73,26 +73,29 @@ export interface FirecrawlDocument { * Parameters for scraping operations. * Defines the options and configurations available for scraping web content. */ -export interface ScrapeParams { +export interface CrawlScrapeOptions { formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[]; headers?: Record; includeTags?: string[]; excludeTags?: string[]; onlyMainContent?: boolean; - extract?: { - prompt?: string; - schema?: ZodSchema | any; - systemPrompt?: string; - }; waitFor?: number; timeout?: number; } +export interface ScrapeParams extends CrawlScrapeOptions { + extract?: { + prompt?: string; + schema?: LLMSchema; + systemPrompt?: string; + }; +} + /** * Response interface for scraping operations. * Defines the structure of the response received after a scraping operation. */ -export interface ScrapeResponse extends FirecrawlDocument { +export interface ScrapeResponse extends FirecrawlDocument { success: true; warning?: string; error?: string; @@ -110,7 +113,7 @@ export interface CrawlParams { allowBackwardLinks?: boolean; allowExternalLinks?: boolean; ignoreSitemap?: boolean; - scrapeOptions?: ScrapeParams; + scrapeOptions?: CrawlScrapeOptions; webhook?: string; } @@ -137,7 +140,7 @@ export interface CrawlStatusResponse { creditsUsed: number; expiresAt: Date; next?: string; - data: FirecrawlDocument[]; + data: FirecrawlDocument[]; }; /** @@ -197,10 +200,10 @@ export default class FirecrawlApp { * @param params - Additional parameters for the scrape request. * @returns The response from the scrape operation. */ - async scrapeUrl( + async scrapeUrl( url: string, - params?: ScrapeParams - ): Promise { + params?: ScrapeParams + ): Promise> | ErrorResponse> { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -528,21 +531,21 @@ export default class FirecrawlApp { } interface CrawlWatcherEvents { - document: CustomEvent, + document: CustomEvent>, done: CustomEvent<{ status: CrawlStatusResponse["status"]; - data: FirecrawlDocument[]; + data: FirecrawlDocument[]; }>, error: CustomEvent<{ status: CrawlStatusResponse["status"], - data: FirecrawlDocument[], + data: FirecrawlDocument[], error: string, }>, } export class CrawlWatcher extends TypedEventTarget { private ws: WebSocket; - public data: FirecrawlDocument[]; + public data: FirecrawlDocument[]; public status: CrawlStatusResponse["status"]; constructor(id: string, app: FirecrawlApp) { @@ -563,7 +566,7 @@ export class CrawlWatcher extends TypedEventTarget { type DocumentMessage = { type: "document", - data: FirecrawlDocument, + data: FirecrawlDocument, } type DoneMessage = { type: "done" } From 620b02f9ca7de0436b1ea0499d7c4684090d351c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 12 Sep 2024 12:51:14 -0400 Subject: [PATCH 40/47] Nick: --- apps/api/src/controllers/v0/scrape.ts | 2 +- apps/api/src/controllers/v1/scrape.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 2a5f1d4f..c46ebc62 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -229,7 +229,7 @@ export async function scrapeController(req: Request, res: Response) { if (result.success) { let creditsToBeBilled = 1; - const creditsPerLLMExtract = 49; + const creditsPerLLMExtract = 4; if (extractorOptions.mode.includes("llm-extraction")) { // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 0835cc2a..f0744c22 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -103,7 +103,7 @@ export async function scrapeController( return; } if(req.body.extract && req.body.formats.includes("extract")) { - creditsToBeBilled = 50; + creditsToBeBilled = 5; } billTeam(req.auth.team_id, creditsToBeBilled).catch(error => { From d497284b40dede0c4aae4fe68d46166207195d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 12 Sep 2024 19:47:15 +0200 Subject: [PATCH 41/47] feat(api/queue): auto-remove completed jobs after 25 hours --- apps/api/src/services/queue-service.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 113b3fa3..14dddebe 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -16,6 +16,14 @@ export function getScrapeQueue() { scrapeQueueName, { connection: redisConnection, + defaultJobOptions: { + removeOnComplete: { + age: 90000, // 25 hours + }, + removeOnFail: { + age: 90000, // 25 hours + }, + }, } // { // settings: { From d30356a22c559408547f88979a5a19ca238ff0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 12 Sep 2024 19:57:33 +0200 Subject: [PATCH 42/47] fix(js-sdk): infer keyword collision --- apps/js-sdk/firecrawl/src/index.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 124a84e8..949cfe98 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,5 +1,5 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios"; -import type { infer as ZodInfer, ZodSchema } from "zod"; +import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; @@ -83,7 +83,7 @@ export interface CrawlScrapeOptions { timeout?: number; } -export interface ScrapeParams extends CrawlScrapeOptions { +export interface ScrapeParams extends CrawlScrapeOptions { extract?: { prompt?: string; schema?: LLMSchema; @@ -200,10 +200,10 @@ export default class FirecrawlApp { * @param params - Additional parameters for the scrape request. * @returns The response from the scrape operation. */ - async scrapeUrl( + async scrapeUrl( url: string, params?: ScrapeParams - ): Promise> | ErrorResponse> { + ): Promise> | ErrorResponse> { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, From e6ac90c1a0ec04739ae6a8e9485d3449c4981c28 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 12 Sep 2024 14:01:47 -0400 Subject: [PATCH 43/47] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index c3135aca..f6f14fb2 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.3.0", + "version": "1.4.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 0d1b46d4763013c2f7950dc71c66d6d30429e0c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 12 Sep 2024 21:30:17 +0200 Subject: [PATCH 44/47] fix(js-sdk): improve error logging --- apps/js-sdk/firecrawl/src/index.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 949cfe98..661ce34b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,4 +1,4 @@ -import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios"; +import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; @@ -452,11 +452,19 @@ export default class FirecrawlApp { * @param headers - The headers for the request. * @returns The response from the GET request. */ - getRequest( + async getRequest( url: string, headers: AxiosRequestHeaders ): Promise { - return axios.get(url, { headers }); + try { + return await axios.get(url, { headers }); + } catch (error) { + if (error instanceof AxiosError && error.response) { + return error.response as AxiosResponse; + } else { + throw error; + } + } } /** From f7c4cee404e17b3ed201e005185a5041009d0e6f Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Fri, 13 Sep 2024 14:02:49 +0200 Subject: [PATCH 45/47] fix(queue-worker): don't send LLM extract hallucination error to Sentry --- apps/api/src/services/queue-worker.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index ad0e4ad5..37e14baf 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -448,11 +448,13 @@ async function processJob(job: Job, token: string) { } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); - Sentry.captureException(error, { - data: { - job: job.id, - }, - }); + if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) { + Sentry.captureException(error, { + data: { + job: job.id, + }, + }); + } if (error instanceof CustomError) { // Here we handle the error, then save the failed job From 000a316cc362b935976ac47b73ec02923f4175c5 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Fri, 13 Sep 2024 16:41:27 +0200 Subject: [PATCH 46/47] fix(fire-engine): poll more frequently --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index a3f393c8..80ac7924 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -127,7 +127,7 @@ export async function scrapWithFireEngine({ let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { - await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second + await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); } From 030ecab6eebd4ad73945e5faf315f1cc547a3277 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 13 Sep 2024 18:09:59 -0400 Subject: [PATCH 47/47] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 1a40671a..51a0ecfa 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -104,6 +104,13 @@ export const devBRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +export const manualRateLimiter = new RateLimiterRedis({ + storeClient: redisRateLimitClient, + keyPrefix: "manual", + points: 2000, + duration: 60, // Duration in seconds +}); + export const scrapeStatusRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, @@ -114,6 +121,8 @@ export const scrapeStatusRateLimiter = new RateLimiterRedis({ const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"]; +const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; + export function getRateLimiter( mode: RateLimiterMode, token: string, @@ -129,6 +138,10 @@ export function getRateLimiter( return devBRateLimiter; } + if(teamId && manual.includes(teamId)) { + return manualRateLimiter; + } + const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} if (!rateLimitConfig) return serverRateLimiter;