From 0566e54d859330b8942ce0f1c6341760e30a0ded Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 26 Aug 2024 15:16:50 -0400 Subject: [PATCH 01/62] init --- .../turning_docs_into_api_specs/api_spec.json | 771 ++++++++++++++++++ .../dify_api_spec.json | 164 ++++ .../docs.firecrawl.dev/api_spec_0.json | 211 +++++ .../docs.firecrawl.dev/api_spec_1.json | 165 ++++ .../docs.firecrawl.dev/api_spec_10.json | 93 +++ .../docs.firecrawl.dev/api_spec_11.json | 131 +++ .../docs.firecrawl.dev/api_spec_13.json | 87 ++ .../docs.firecrawl.dev/api_spec_15.json | 83 ++ .../docs.firecrawl.dev/api_spec_16.json | 200 +++++ .../docs.firecrawl.dev/api_spec_2.json | 54 ++ .../docs.firecrawl.dev/api_spec_22.json | 166 ++++ .../docs.firecrawl.dev/api_spec_25.json | 229 ++++++ .../docs.firecrawl.dev/api_spec_26.json | 115 +++ .../docs.firecrawl.dev/api_spec_3.json | 185 +++++ .../docs.firecrawl.dev/api_spec_30.json | 212 +++++ .../docs.firecrawl.dev/api_spec_31.json | 199 +++++ .../docs.firecrawl.dev/api_spec_33.json | 202 +++++ .../docs.firecrawl.dev/api_spec_34.json | 201 +++++ .../docs.firecrawl.dev/api_spec_35.json | 245 ++++++ .../docs.firecrawl.dev/api_spec_4.json | 129 +++ .../docs.firecrawl.dev/api_spec_5.json | 186 +++++ .../docs.firecrawl.dev/api_spec_7.json | 86 ++ .../docs.firecrawl.dev/api_spec_8.json | 59 ++ .../docs.firecrawl.dev/combined_api_spec.json | 738 +++++++++++++++++ .../turning_docs_into_api_specs.ipynb | 287 +++++++ 25 files changed, 5198 insertions(+) create mode 100644 examples/turning_docs_into_api_specs/api_spec.json create mode 100644 examples/turning_docs_into_api_specs/dify_api_spec.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json create 
mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json create mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json create mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb diff --git a/examples/turning_docs_into_api_specs/api_spec.json b/examples/turning_docs_into_api_specs/api_spec.json new file mode 100644 index 00000000..d866efd3 --- /dev/null +++ b/examples/turning_docs_into_api_specs/api_spec.json @@ -0,0 +1,771 @@ +{ + 
"info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "/crawl/cancel/{jobId}": { + "/crawl/status/{jobId}": { + "get": { + "/scrape": { + "/search": { + "post": { + "components": { + "securitySchemes": { + "Authorization": { + "bearerFormat": "JWT", + "scheme": "bearer", + "type": "http" + } + } + }, + "description": "Send a request to perform a web search and get scraped results from the top pages.", + "operationId": "searchWeb", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "description": "Options for controlling the scraping behavior of search result pages.", + "properties": { + "fetchPageContent": { + "default": true, + "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", + "type": "boolean" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The search query.", + "required": true, + "type": "string" + }, + "searchOptions": { + "description": "Options for controlling the search.", + "properties": { + "limit": { + "description": "Maximum number of search results to return.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." 
+ }, + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "An array of search results.", + "items": { + "properties": { + "content": { + "description": "Raw content of the search result page.", + "type": "string" + }, + "markdown": { + "description": "Markdown content of the search result page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the search result page.", + "properties": { + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "nullable": true, + "type": "string" + }, + "sourceURL": { + "description": "Source URL of the search result page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "url": { + "description": "URL of the search result.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "description": "Indicates if the search was successful.", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Web search completed successfully." + } + } + }, + "summary": "Search the Web" + } + }, + "post": { + "description": "Send a request to scrape a single URL and get its content.", + "operationId": "scrapeURL", + "parameters": [], + "requestBody": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + }, + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. 
The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", + "properties": { + "extractionPrompt": { + "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes.", + "type": "string" + }, + "extractionSchema": { + "description": "The schema for the data to be extracted, required only for LLM extraction modes.", + "type": "object" + }, + "mode": { + "default": "markdown", + "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM.", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for controlling the scraping behavior.", + "properties": { + "fullPageScreenshot": { + "default": false, + "description": "Include a full page screenshot of the page that you are scraping.", + "type": "boolean" + }, + "headers": { + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", + "type": "object" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. 
Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + }, + "removeTags": { + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "default": false, + "description": "Replace all relative paths with absolute paths for images and links", + "type": "boolean" + }, + "screenshot": { + "default": false, + "description": "Include a screenshot of the top of the page that you are scraping.", + "type": "boolean" + }, + "waitFor": { + "default": 0, + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "default": 30000, + "description": "Timeout in milliseconds for the request", + "type": "integer" + }, + "url": { + "description": "The URL to scrape.", + "required": true, + "type": "string" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the request.", + "nullable": true, + "type": "string" + }, + "llm_extraction": { + "description": "Extracted data from the page using the specified schema, only present if an LLM extraction mode was used.", + "nullable": true, + "type": "object" + }, + "markdown": { + "description": "Markdown version of the page content.", + "type": "string" + }, + "metadata": { + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": 
"string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the request.", + "nullable": true, + "type": "string" + }, + "warning": { + "description": "Warning message from the LLM extraction process, if any.", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "success": { + "description": "Indicates whether the scraping was successful.", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "URL scraped successfully." + } + } + }, + "summary": "Scrape a URL" + } + }, + "description": "Send a request to get the status and results of a crawl job.", + "operationId": "getCrawlJobStatus", + "parameters": [ + { + "description": "ID of the crawl job to check.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": {} + }, + "responses": { + "200": { + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + }, + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "The number of pages crawled so far.", + "type": "integer" + }, + "data": { + "description": "The crawl results. 
Only available when the crawl job is completed.", + "items": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", + "type": "string" + }, + "index": { + "description": "The index of the crawled page in the results.", + "type": "integer" + }, + "markdown": { + "description": "Markdown content of the page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the page.", + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": "string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "partial_data": { + "description": "Partial results streamed as the crawl progresses. 
This feature is in alpha and may change.", + "items": { + "properties": { + "content": { + "description": "Raw content of the page.", + "type": "string" + }, + "html": { + "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", + "type": "string" + }, + "index": { + "description": "The index of the crawled page in the results.", + "type": "integer" + }, + "markdown": { + "description": "Markdown content of the page.", + "type": "string" + }, + "metadata": { + "description": "Metadata extracted from the page.", + "properties": { + " ": { + "description": "Any other extracted metadata.", + "type": "string" + }, + "description": { + "description": "Page description.", + "type": "string" + }, + "language": { + "description": "Page language.", + "type": "string" + }, + "pageError": { + "description": "Error message if there was an error scraping the page.", + "type": "string" + }, + "pageStatusCode": { + "description": "HTTP status code of the page.", + "type": "integer" + }, + "sourceURL": { + "description": "Source URL of the page.", + "type": "string" + }, + "title": { + "description": "Page title.", + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Status of the crawl job. Can be 'completed', 'active', 'failed', or 'paused'.", + "enum": [ + "completed", + "active", + "failed", + "paused" + ], + "type": "string" + }, + "total": { + "description": "The total estimated number of pages to crawl.", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status retrieved." 
+ } + }, + "summary": "Get Crawl Job Status" + } + }, + "delete": { + "description": "Send a request to cancel a running crawl job.", + "operationId": "cancelCrawlJob", + "parameters": [ + { + "description": "ID of the crawl job to cancel.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": {} + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "The status of the crawl job cancellation request, usually 'cancelled'.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job cancellation request submitted." + }, + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + } + }, + "summary": "Cancel a Crawl Job" + } + }, + "description": "Send a request to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.", + "operationId": "crawlWebsite", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Options for controlling the crawling behavior.", + "properties": { + "allowBackwardCrawling": { + "default": false, + "description": "Enables the crawler to navigate from a specific URL to previously linked pages. 
For instance, from 'example.com/product/123' back to 'example.com/product'", + "type": "boolean" + }, + "allowExternalContentLinks": { + "default": false, + "description": "Allows the crawler to follow links to external websites.", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "default": false, + "description": "Generate alt text for images using LLMs (must have a paid plan)", + "type": "boolean" + }, + "ignoreSitemap": { + "default": false, + "description": "Ignore the website sitemap when crawling", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "default": 10000, + "description": "Maximum number of pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.", + "type": "integer" + }, + "mode": { + "default": "default", + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "default": false, + "description": "If true, returns only the URLs as a list on the crawl status. 
Attention: the return response will be a list of URLs inside the data, not a list of documents.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for controlling the scraping behavior of individual pages.", + "properties": { + "fullPageScreenshot": { + "default": false, + "description": "Include a full page screenshot of the page that you are scraping.", + "type": "boolean" + }, + "headers": { + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", + "type": "object" + }, + "includeHtml": { + "default": false, + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "type": "boolean" + }, + "includeRawHtml": { + "default": false, + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "default": false, + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "type": "boolean" + }, + "removeTags": { + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "default": false, + "description": "Replace all relative paths with absolute paths for images and links", + "type": "boolean" + }, + "screenshot": { + "default": false, + "description": "Include a screenshot of the top of the page that you are scraping.", + "type": "boolean" + }, + "waitFor": { + "default": 0, + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "The base URL to start crawling from", + "required": true, + "type": "string" + } + }, + "type": "object" + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "The ID of the submitted crawl job.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted successfully." + }, + "402": { + "description": "Payment required." + }, + "429": { + "description": "Rate limit exceeded." + }, + "500": { + "description": "Internal server error." + } + } + }, + "summary": "Crawl a Website" + } + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/dify_api_spec.json b/examples/turning_docs_into_api_specs/dify_api_spec.json new file mode 100644 index 00000000..e6eec457 --- /dev/null +++ b/examples/turning_docs_into_api_specs/dify_api_spec.json @@ -0,0 +1,164 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Knowledge Base API", + "description": "API for managing knowledge bases and documents." 
+ }, + "paths": { + "/datasets": { + "post": { + "summary": "Create an Empty Dataset", + "description": "Only used to create an empty dataset", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + } + } + }, + "responses": {} + }, + "get": { + "summary": "Dataset List", + "parameters": [ + { + "name": "page", + "in": "query", + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "schema": { + "type": "integer" + } + } + ], + "responses": {} + } + }, + "/datasets/{dataset_id}/document/create_by_text": { + "post": { + "summary": "Create Document by Text", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "text": { + "type": "string" + }, + "indexing_technique": { + "type": "string" + }, + "process_rule": { + "type": "object" + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/document/create_by_file": { + "post": { + "summary": "Create Document by File", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "properties": { + "data": { + "type": "string" + }, + "file": { + "type": "string", + "format": "binary" + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{batch}/indexing-status": { + "get": { + "summary": "Get Document Embedding Status (Progress)", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{document_id}": { + "delete": { + "summary": "Delete Document", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents": { + "get": { + "summary": "Dataset Document List", + "responses": {} + } + }, + "/datasets/{dataset_id}/documents/{document_id}/segments": { + "post": { + "summary": "Add Segments", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + 
"segments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "answer": { + "type": "string" + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + } + } + } + } + }, + "responses": {} + } + }, + "/datasets/{dataset_id}/segments/{segment_id}": { + "delete": { + "summary": "Delete Document Segment", + "responses": {} + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json new file mode 100644 index 00000000..84bce02c --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json @@ -0,0 +1,211 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawling options.", + "properties": { + "excludes": { + "description": "URL patterns to exclude.", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URL patterns to include.", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl.", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth.", + "type": "integer" + }, + "mode": { + "description": "Crawling mode.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + 
"description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job initiated." + } + }, + "summary": "Crawl multiple pages." + } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "Crawl job ID.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Data extraction options.", + "properties": { + "extractionPrompt": { + "description": "Prompt for data extraction.", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for data extraction.", + "type": "object" + }, + "mode": { + "description": "Extraction mode.", + "enum": [ + "llm-extraction", + "llm-extraction-from-raw-html" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": 
"boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in milliseconds.", + "type": "integer" + }, + "url": { + "description": "URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Successful scraping." + } + }, + "summary": "Scrape a single page." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json new file mode 100644 index 00000000..8656c978 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json @@ -0,0 +1,165 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "allowBackwardCrawling": { + "description": "Allow backward crawling", + "type": "boolean" + }, + "allowExternalContentLinks": { + "description": "Allow external links", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "ignoreSitemap": { + "description": "Ignore website sitemap", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return 
only crawled URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for requests", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Return only main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Use absolute paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait for page load (ms)", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID of the crawl", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl request successful" + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Crawl a website" + } + } + }, + "securitySchemes": { + "Bearer": { + "bearerFormat": "JWT", + "scheme": "bearer", + "type": "http" + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json new file mode 100644 index 00000000..55f73a32 --- /dev/null +++ 
b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json @@ -0,0 +1,93 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check_crawl_status": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl job status" + } + }, + "/crawl": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Job ID" + } + }, + "summary": "Crawl URL and subpages" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json new file mode 100644 index 00000000..e19ed056 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json @@ -0,0 +1,131 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "example": { + "extractorOptions": { + "extractionPrompt": "Based on the information on the page, extract the information from the schema. 
", + "extractionSchema": { + "properties": { + "company_mission": { + "type": "string" + }, + "is_in_yc": { + "type": "boolean" + }, + "is_open_source": { + "type": "boolean" + }, + "supports_sso": { + "type": "boolean" + } + }, + "required": [ + "company_mission", + "supports_sso", + "is_open_source", + "is_in_yc" + ], + "type": "object" + }, + "mode": "llm-extraction" + }, + "url": "https://docs.firecrawl.dev/" + }, + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionPrompt": { + "description": "Prompt for extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for data extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "example": { + "data": { + "content": "Raw Content", + "llm_extraction": { + "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", + "is_in_yc": true, + "is_open_source": false, + "supports_sso": true + }, + "metadata": { + "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", + "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. 
Brought to you by SideGuide", + "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", + "ogLocaleAlternate": [], + "ogSiteName": "Mendable", + "ogTitle": "Mendable", + "ogUrl": "https://docs.firecrawl.dev/", + "robots": "follow, index", + "sourceURL": "https://docs.firecrawl.dev/", + "title": "Mendable" + } + }, + "success": true + }, + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "metadata": { + "type": "object" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "summary": "Extract data from pages." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json new file mode 100644 index 00000000..0352c66f --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json @@ -0,0 +1,87 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, 
+ "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search and scrape." + } + }, + "summary": "Search web, scrape, return markdown." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json new file mode 100644 index 00000000..e7384f8e --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json @@ -0,0 +1,83 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "url": { + "description": "Website URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "properties": { + "markdown": { + "description": "Markdown content.", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + } + } + }, + "description": "Website crawled successfully." + } + }, + "summary": "Crawl a website." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "url": { + "description": "Page URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "text/plain": { + "schema": { + "description": "Scraped content.", + "type": "string" + } + } + }, + "description": "Page scraped successfully." + } + }, + "summary": "Scrape a single page." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json new file mode 100644 index 00000000..ed6fb9d6 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json @@ -0,0 +1,200 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawler_options": { + "properties": { + "exclude": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Max pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs", + "type": "boolean" + }, + "timeout": { + "description": "Timeout in milliseconds", + "type": "integer" + } + }, + "type": "object" + }, + "page_options": { + "properties": { + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Crawl successful." + } + }, + "summary": "Crawl a website." 
+ } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractor_options": { + "properties": { + "extractionPrompt": { + "description": "Prompt for extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for extraction", + "type": "string" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "page_options": { + "properties": { + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in milliseconds", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Scrape successful." + } + }, + "summary": "Scrape a website." + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "page_options": { + "properties": { + "fetchPageContent": { + "description": "Fetch full content", + "type": "boolean" + }, + "includeHtml": { + "description": "Include raw HTML", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search query string", + "type": "string" + }, + "search_options": { + "properties": { + "limit": { + "description": "Max results", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Search successful." + } + }, + "summary": "Search Firecrawl index." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json new file mode 100644 index 00000000..25cf6c05 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json @@ -0,0 +1,54 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl/cancel/{jobId}": { + "delete": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Returns cancelled." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Cancel crawl job" + } + } + }, + "securitySchemes": { + "Bearer": { + "bearerFormat": "Bearer ", + "scheme": "bearer", + "type": "http" + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json new file mode 100644 index 00000000..ac146a63 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json @@ -0,0 +1,166 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check-crawl-status/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current progress", + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + 
"markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "type": "object" + }, + "provider": { + "description": "Data provider", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Job status", + "type": "string" + }, + "total": { + "description": "Total pages", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawler options", + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted." + } + }, + "summary": "Crawl a URL." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Extractor options", + "type": "object" + }, + "pageOptions": { + "description": "Page options", + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Scraped data", + "type": "object" + }, + "success": { + "description": "Success flag", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json new file mode 100644 index 00000000..9701a462 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json @@ -0,0 +1,229 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "excludes": { + "description": "Paths to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Paths to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "returnOnlyUrls": { + "description": "Only return URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job created" + } + }, + "summary": "Crawl a website" + } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + 
"properties": { + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Job status", + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Get crawl job status" + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scrape results" + } + }, + "summary": "Scrape a webpage" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json new file mode 100644 index 00000000..b642e9c0 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json @@ -0,0 +1,115 @@ +{ + "info": { + "title": "Firecrawl API", + "version": 
"1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "example": { + "extractorOptions": { + "extractionPrompt": "Extract company info.", + "extractionSchema": { + "properties": { + "company_description": { + "type": "string" + }, + "company_industry": { + "type": "string" + }, + "who_they_serve": { + "type": "string" + } + }, + "required": [ + "company_description", + "company_industry", + "who_they_serve" + ], + "type": "object" + }, + "mode": "llm-extraction" + }, + "pageOptions": { + "onlyMainContent": true + }, + "url": "https://example.com" + }, + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction.", + "type": "string" + }, + "extractionSchema": { + "properties": { + "properties": { + "company_description": { + "type": "string" + }, + "company_industry": { + "type": "string" + }, + "who_they_serve": { + "type": "string" + } + }, + "required": [ + "company_description", + "company_industry", + "who_they_serve" + ], + "type": { + "type": "string" + } + }, + "type": "object" + }, + "mode": { + "description": "Extraction mode.", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Successful scrape." + } + }, + "summary": "Scrape data from URL." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json new file mode 100644 index 00000000..bcf94159 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json @@ -0,0 +1,185 @@ +{ + "components": { + "securitySchemes": { + "bearerAuth": { + "scheme": "bearer", + "type": "http" + } + } + }, + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction", + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for LLM extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for request", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Only return main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Replace relative paths", + 
"type": "boolean" + }, + "screenshot": { + "description": "Include screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in ms", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": "Timeout in ms", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "security": [ + { + "bearerAuth": [] + } + ], + "summary": "Scrape a webpage" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json new file mode 100644 index 00000000..bc542e2a --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json @@ -0,0 +1,212 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawl job options", + "properties": { + "excludes": { + 
"description": "Pages to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Pages to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Max pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options", + "properties": { + "onlyMainContent": { + "description": "Only scrape main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "required": [ + "url" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Crawl job result", + "type": "object" + } + } + }, + "description": "Crawl job result" + } + }, + "summary": "Crawl a website" + } + }, + "/crawl/{jobId}/cancel": { + "post": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Cancellation status", + "type": "object" + } + } + }, + "description": "Cancellation status" + } + }, + "summary": "Cancel crawl job" + } + }, + "/crawl/{jobId}/status": { + "get": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Crawl status", + "type": "object" + } + } + }, + "description": "Crawl status" + } + }, + "summary": "Check crawl status" + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "LLM extraction options", + "properties": { + "extractionSchema": { + "description": "JSON 
schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "required": [ + "url" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Scraped data", + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL" + } + }, + "/search": { + "get": { + "parameters": [ + { + "description": "Search query", + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "description": "Search results", + "type": "object" + } + } + }, + "description": "Search results" + } + }, + "summary": "Search and scrape" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json new file mode 100644 index 00000000..07f71759 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json @@ -0,0 +1,199 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "excludes": { + "description": "Paths to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "Paths to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": 
"Starting URL for crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Unique job identifier", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job started" + } + }, + "summary": "Crawl a website" + } + }, + "/crawl/{jobId}/status": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "Current job status", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl status" + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "properties": { + "extractionSchema": { + "description": "Zod schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Extracted data", + "type": "object" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL" + } + }, + "/search": { + "get": { + "parameters": [ + { + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "properties": { + "content": { + "description": "Page content (optional)", + "type": "string" + }, + "url": { + "description": "Result URL", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + } + } 
+ }, + "description": "Search results" + } + }, + "summary": "Search for a query" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json new file mode 100644 index 00000000..b45ae841 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json @@ -0,0 +1,202 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Options for crawling", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URLs to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for page content", + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Unique crawl job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job started." + } + }, + "summary": "Crawl a website." 
+ } + }, + "/crawl/{jobId}": { + "get": { + "parameters": [ + { + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "description": "Current job status", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for data extraction", + "properties": { + "extractionSchema": { + "description": "Pydantic schema", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Options for page content", + "properties": { + "onlyMainContent": { + "description": "Extract only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "query": { + "description": "Search query", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Search results." + } + }, + "summary": "Search the web." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json new file mode 100644 index 00000000..3bafda42 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json @@ -0,0 +1,201 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "0.1" + }, + "openapi": "3.0.0", + "paths": { + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawl job options", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URLs to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options", + "properties": { + "onlyMainContent": { + "description": "Only scrape main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Crawl job started" + } + }, + "summary": "Crawl a website." + } + }, + "/crawl/{job_id}/cancel": { + "post": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "job_id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Cancellation status" + } + }, + "summary": "Cancel crawl job." 
+ } + }, + "/crawl/{job_id}/status": { + "get": { + "parameters": [ + { + "description": "Crawl job ID", + "in": "path", + "name": "job_id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Crawl status" + } + }, + "summary": "Check crawl status." + } + }, + "/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "LLM extraction options", + "properties": { + "extractionSchema": { + "description": "JSON schema for extraction", + "type": "object" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Scraped data" + } + }, + "summary": "Scrape a single URL." + } + }, + "/search": { + "get": { + "parameters": [ + { + "description": "Search query", + "in": "query", + "name": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "type": "object" + } + } + }, + "description": "Search results" + } + }, + "summary": "Search and scrape results." 
+ } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json new file mode 100644 index 00000000..890d31b1 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json @@ -0,0 +1,245 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check-crawl-status": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current page count", + "type": "integer" + }, + "data": { + "description": "Crawl data", + "items": { + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + "markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "properties": { + "description": { + "description": "Page description", + "type": "string" + }, + "language": { + "description": "Page language", + "type": "string" + }, + "sourceURL": { + "description": "Page URL", + "type": "string" + }, + "title": { + "description": "Page title", + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "description": "Content provider", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Crawl status", + "type": "string" + }, + "total": { + "description": "Total page count", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." 
+ } + }, + "/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawler options", + "properties": { + "excludes": { + "description": "URLs to exclude", + "items": { + "type": "string" + }, + "type": "array" + } + }, + "type": "object" + }, + "url": { + "description": "URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job submitted." + } + }, + "summary": "Crawl a URL." + } + }, + "/scrape-url": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Extractor options", + "properties": { + "extractionSchema": { + "description": "Extraction schema", + "type": "string" + }, + "mode": { + "description": "Extraction mode", + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page options", + "properties": { + "onlyMainContent": { + "description": "Only main content", + "type": "boolean" + } + }, + "type": "object" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "description": "Scraped data", + "properties": { + "content": { + "description": "Raw content", + "type": "string" + }, + "html": { + "description": "HTML content", + "type": "string" + }, + "llm_extraction": { + "description": "LLM extraction results", + "type": "object" + }, + "markdown": { + "description": "Markdown content", + "type": "string" + }, + "metadata": { + "description": "Page metadata", + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML 
content", + "type": "string" + }, + "warning": { + "description": "Warning message", + "type": "string" + } + }, + "type": "object" + }, + "success": { + "description": "Request success", + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Scraped data." + } + }, + "summary": "Scrape a single URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json new file mode 100644 index 00000000..daf53932 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json @@ -0,0 +1,129 @@ +{ + "components": { + "securitySchemes": { + "Bearer": { + "scheme": "bearer", + "type": "http" + } + } + }, + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch content of each page.", + "type": "boolean" + }, + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only return main content.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The query to search for", + "type": "string" + }, + "searchOptions": { + "properties": { + "limit": { + "description": "Maximum number of results.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + 
"description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Search the web." + } + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json new file mode 100644 index 00000000..4fae28c0 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json @@ -0,0 +1,186 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "description": "Current page number", + "type": "integer" + }, + "data": { + "description": "Data from the job", + "items": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "description": "HTML content", + "nullable": true, + "type": "string" + }, + "index": { + "description": "Page number crawled", + "type": "integer" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message of page", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "Status code of 
page", + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + }, + "{any other metadata}": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "partial_data": { + "description": "Partial documents (streaming)", + "items": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "description": "HTML content", + "nullable": true, + "type": "string" + }, + "index": { + "description": "Page number crawled", + "type": "integer" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "nullable": true, + "type": "string" + }, + "pageError": { + "description": "Error message of page", + "nullable": true, + "type": "string" + }, + "pageStatusCode": { + "description": "Status code of page", + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + }, + "{any other metadata}": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "description": "Raw HTML content", + "nullable": true, + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "description": "Status of the job", + "type": "string" + }, + "total": { + "description": "Total number of pages", + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Successful operation" + } + }, + "security": [ + { + "Authorization": [] + } + ], + "summary": "Get crawl job status" + } + } + }, + "securitySchemes": { + "Authorization": { + "bearerFormat": "Bearer ", + "scheme": "bearer", + "type": "http" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ] +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json 
b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json new file mode 100644 index 00000000..b74b9886 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json @@ -0,0 +1,86 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/v0/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch page content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search term", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search" + } + }, + "summary": "Search and extract content" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json new file mode 100644 index 00000000..2d5f40e2 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json @@ -0,0 +1,59 @@ +{ + "info": { + "title": "Firecrawl API", + "version": "v0" + }, + "openapi": "3.0.0", + "paths": { + "/test": { + "get": { + "description": "Returns a test message.", + "responses": { + "200": { + "content": { + "text/plain": { + "schema": { + "example": "Hello, world!", + 
"type": "string" + } + } + }, + "description": "Successful operation" + } + }, + "summary": "Test endpoint" + } + }, + "/v0/crawl": { + "post": { + "description": "Processes crawl job for URL.", + "requestBody": { + "content": { + "application/json": { + "example": { + "url": "https://docs.firecrawl.dev" + }, + "schema": { + "properties": { + "url": { + "description": "Website URL", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "URL to crawl", + "required": true + }, + "responses": { + "200": { + "description": "Crawl initiated." + } + }, + "summary": "Crawl a given URL." + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json new file mode 100644 index 00000000..77d67234 --- /dev/null +++ b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json @@ -0,0 +1,738 @@ +{ + "components": { + "schemas": {} + }, + "info": { + "title": "https://docs.firecrawl.dev API Specification", + "version": "1.0.0" + }, + "openapi": "3.0.0", + "paths": { + "/check_crawl_status": { + "post": { + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "current": { + "type": "integer" + }, + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "provider": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "status": { + "type": "string" + }, + "total": { + "type": "integer" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job status" + } + }, + "summary": "Check crawl job status" + } + }, + "/crawl": { + 
"post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "properties": { + "allowBackwardCrawling": { + "description": "Allow backward crawling", + "type": "boolean" + }, + "allowExternalContentLinks": { + "description": "Allow external links", + "type": "boolean" + }, + "excludes": { + "description": "URL patterns to exclude", + "items": { + "type": "string" + }, + "type": "array" + }, + "generateImgAltText": { + "description": "Generate alt text for images", + "type": "boolean" + }, + "ignoreSitemap": { + "description": "Ignore website sitemap", + "type": "boolean" + }, + "includes": { + "description": "URL patterns to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth", + "type": "integer" + }, + "mode": { + "description": "Crawling mode", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only crawled URLs", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for requests", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Return only main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove specific tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Use absolute paths", + "type": "boolean" + }, + 
"screenshot": { + "description": "Include page screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait for page load (ms)", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Job ID of the crawl", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl request successful" + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Crawl a website" + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "parameters": [ + { + "description": "ID of crawl job", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Returns cancelled." 
+ } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Cancel crawl job" + } + }, + "/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch content of each page.", + "type": "boolean" + }, + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only return main content.", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "The query to search for", + "type": "string" + }, + "searchOptions": { + "properties": { + "limit": { + "description": "Maximum number of results.", + "type": "integer" + } + }, + "type": "object" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "content": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful search." + } + }, + "security": [ + { + "Bearer": [] + } + ], + "summary": "Search the web." 
+ } + }, + "/v0/crawl": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "crawlerOptions": { + "description": "Crawling options.", + "properties": { + "excludes": { + "description": "URL patterns to exclude.", + "items": { + "type": "string" + }, + "type": "array" + }, + "includes": { + "description": "URL patterns to include.", + "items": { + "type": "string" + }, + "type": "array" + }, + "limit": { + "description": "Maximum pages to crawl.", + "type": "integer" + }, + "maxDepth": { + "description": "Maximum crawl depth.", + "type": "integer" + }, + "mode": { + "description": "Crawling mode.", + "enum": [ + "default", + "fast" + ], + "type": "string" + }, + "returnOnlyUrls": { + "description": "Return only URLs.", + "type": "boolean" + } + }, + "type": "object" + }, + "pageOptions": { + "description": "Page scraping options.", + "properties": { + "includeHtml": { + "description": "Include HTML content.", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content.", + "type": "boolean" + }, + "onlyMainContent": { + "description": "Only main content.", + "type": "boolean" + }, + "screenshot": { + "description": "Include page screenshot.", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in milliseconds.", + "type": "integer" + } + }, + "type": "object" + }, + "url": { + "description": "Base URL to crawl.", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "jobId": { + "description": "Crawl job ID.", + "type": "string" + } + }, + "type": "object" + } + } + }, + "description": "Crawl job initiated." + } + }, + "summary": "Crawl multiple pages." 
+ } + }, + "/v0/crawl/status/{jobId}": { + "get": { + "parameters": [ + { + "description": "Crawl job ID.", + "in": "path", + "name": "jobId", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job status." + } + }, + "summary": "Check crawl job status." + } + }, + "/v0/scrape": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "extractorOptions": { + "description": "Options for extraction", + "properties": { + "extractionPrompt": { + "description": "Prompt for LLM extraction", + "type": "string" + }, + "extractionSchema": { + "description": "Schema for LLM extraction", + "type": "object" + }, + "mode": { + "description": "Extraction mode", + "enum": [ + "markdown", + "llm-extraction", + "llm-extraction-from-raw-html", + "llm-extraction-from-markdown" + ], + "type": "string" + } + }, + "type": "object" + }, + "pageOptions": { + "properties": { + "fullPageScreenshot": { + "description": "Include full page screenshot", + "type": "boolean" + }, + "headers": { + "description": "Headers for request", + "type": "object" + }, + "includeHtml": { + "description": "Include HTML content", + "type": "boolean" + }, + "includeRawHtml": { + "description": "Include raw HTML content", + "type": "boolean" + }, + "onlyIncludeTags": { + "description": "Include only these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "onlyMainContent": { + "description": "Only return main content", + "type": "boolean" + }, + "removeTags": { + "description": "Remove these tags", + "items": { + "type": "string" + }, + "type": "array" + }, + "replaceAllPathsWithAbsolutePaths": { + "description": "Replace relative paths", + "type": "boolean" + }, + "screenshot": { + "description": "Include screenshot", + "type": "boolean" + }, + "waitFor": { + "description": "Wait time in ms", + "type": "integer" + } + }, + "type": "object" + }, + "timeout": { + "description": 
"Timeout in ms", + "type": "integer" + }, + "url": { + "description": "URL to scrape", + "type": "string" + } + }, + "type": "object" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "properties": { + "content": { + "type": "string" + }, + "html": { + "type": "string" + }, + "llm_extraction": { + "type": "object" + }, + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "pageError": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "rawHtml": { + "type": "string" + }, + "warning": { + "type": "string" + } + }, + "type": "object" + }, + "success": { + "type": "boolean" + } + }, + "type": "object" + } + } + }, + "description": "Successful scrape" + } + }, + "security": [ + { + "bearerAuth": [] + } + ], + "summary": "Scrape a webpage" + } + }, + "/v0/search": { + "post": { + "requestBody": { + "content": { + "application/json": { + "schema": { + "properties": { + "pageOptions": { + "properties": { + "fetchPageContent": { + "description": "Fetch page content", + "type": "boolean" + } + }, + "type": "object" + }, + "query": { + "description": "Search term", + "type": "string" + } + }, + "type": "object" + } + } + } + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "properties": { + "data": { + "items": { + "properties": { + "markdown": { + "type": "string" + }, + "metadata": { + "properties": { + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "title": { + "type": "string" + } + }, + "type": "object" + }, + "url": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "success": { + "type": "boolean" + 
} + }, + "type": "object" + } + } + }, + "description": "Successful search" + } + }, + "summary": "Search and extract content" + } + } + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb new file mode 100644 index 00000000..1b97f67b --- /dev/null +++ b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "import google.generativeai as genai\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Configure the Google Generative AI module with the API key\n", + "genai.configure(api_key=google_api_key)\n", + "model = genai.GenerativeModel(\"gemini-1.5-pro-001\")\n", + "\n", + "# Set the docs URL\n", + "docs_url=\"https://docs.firecrawl.dev\"\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "36\n" + ] + } + ], + "source": [ + 
"# Crawl all pages on docs\n", + "params = {\n", + " \"pageOptions\": {\n", + " \"onlyMainContent\": True\n", + " },\n", + "}\n", + "crawl_result = app.crawl_url(docs_url, params=params)\n", + "\n", + "print(len(crawl_result))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_instructions = f\"\"\"Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.\n", + "\n", + "If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}.\n", + "\n", + "Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.\n", + "\n", + "API Documentation Content:\n", + "{{content}}\n", + "\n", + "Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.\n", + "\n", + "Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}.\n", + "\n", + "To reiterate: accuracy is paramount. Do not make anything up. 
If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}.\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API specification saved to docs.firecrawl.dev/api_spec_0.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_1.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_2.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_3.json\n", + "API specification saved to docs.firecrawl.dev/api_spec_4.json\n", + "An error occurred for page 5: 'content'\n", + "No API specification found for page 6\n", + "API specification saved to docs.firecrawl.dev/api_spec_7.json\n", + "No API specification found for page 8\n", + "No API specification found for page 9\n", + "API specification saved to docs.firecrawl.dev/api_spec_10.json\n", + "No API specification found for page 11\n", + "No API specification found for page 12\n", + "API specification saved to docs.firecrawl.dev/api_spec_13.json\n", + "No API specification found for page 14\n", + "No API specification found for page 15\n", + "No API specification found for page 16\n", + "No API specification found for page 17\n", + "No API specification found for page 18\n", + "No API specification found for page 19\n", + "No API specification found for page 20\n", + "No API specification found for page 21\n", + "No API specification found for page 22\n", + "No API specification found for page 23\n", + "No API specification found for page 24\n", + "No API specification found for page 25\n", + "No API specification found for page 26\n", + "No API specification found for page 27\n", + "No API specification found for page 28\n", + "No API specification found for page 29\n", + "No API specification found for page 30\n", + "No API specification found for page 31\n", + "No API specification found for page 32\n", + "No API 
specification found for page 33\n", + "No API specification found for page 34\n", + "No API specification found for page 35\n", + "Total API specifications collected: 8\n" + ] + } + ], + "source": [ + "# Create a folder for storing API specs\n", + "import os\n", + "import urllib.parse\n", + "\n", + "folder_name = urllib.parse.urlparse(docs_url).netloc\n", + "os.makedirs(folder_name, exist_ok=True)\n", + "\n", + "# Initialize a list to store all API specs\n", + "all_api_specs = []\n", + "\n", + "# Process each page in crawl_result\n", + "for index, result in enumerate(crawl_result):\n", + " if 'content' in result:\n", + " # Update prompt_instructions with the current page's content\n", + " current_prompt = prompt_instructions.replace(\"{content}\", result['content'])\n", + " try:\n", + " # Query the model\n", + " response = model.generate_content([current_prompt])\n", + " response_dict = response.to_dict()\n", + " response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", + " \n", + " # Remove the ```json code wrap if present\n", + " response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()\n", + " \n", + " # Parse JSON\n", + " json_data = json.loads(response_text)\n", + " \n", + " # Save non-empty API specs\n", + " if json_data != {}:\n", + " output_file = os.path.join(folder_name, f'api_spec_{index}.json')\n", + " with open(output_file, 'w') as f:\n", + " json.dump(json_data, f, indent=2, sort_keys=True)\n", + " print(f\"API specification saved to {output_file}\")\n", + " \n", + " # Add the API spec to the list\n", + " all_api_specs.append(json_data)\n", + " else:\n", + " print(f\"No API specification found for page {index}\")\n", + " \n", + " except json.JSONDecodeError:\n", + " print(f\"Error parsing JSON response for page {index}\")\n", + " except Exception as e:\n", + " print(f\"An error occurred for page {index}: {str(e)}\")\n", + "\n", + "# Print the total number of API specs collected\n", + 
"print(f\"Total API specifications collected: {len(all_api_specs)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Combined API specification saved to docs.firecrawl.dev/combined_api_spec.json\n", + "Total paths in combined spec: 8\n", + "Total schemas in combined spec: 0\n" + ] + } + ], + "source": [ + "# Combine all API specs and keep the most filled out spec for each path and method\n", + "combined_spec = {\n", + " \"openapi\": \"3.0.0\",\n", + " \"info\": {\n", + " \"title\": f\"{docs_url} API Specification\",\n", + " \"version\": \"1.0.0\"\n", + " },\n", + " \"paths\": {},\n", + " \"components\": {\n", + " \"schemas\": {}\n", + " }\n", + "}\n", + "\n", + "def count_properties(obj):\n", + " if isinstance(obj, dict):\n", + " return sum(count_properties(v) for v in obj.values()) + len(obj)\n", + " elif isinstance(obj, list):\n", + " return sum(count_properties(item) for item in obj)\n", + " else:\n", + " return 1\n", + "\n", + "for spec in all_api_specs:\n", + " if \"paths\" in spec:\n", + " for path, methods in spec[\"paths\"].items():\n", + " if path not in combined_spec[\"paths\"]:\n", + " combined_spec[\"paths\"][path] = {}\n", + " for method, details in methods.items():\n", + " if method not in combined_spec[\"paths\"][path] or count_properties(details) > count_properties(combined_spec[\"paths\"][path][method]):\n", + " combined_spec[\"paths\"][path][method] = details\n", + "\n", + " if \"components\" in spec and \"schemas\" in spec[\"components\"]:\n", + " for schema_name, schema in spec[\"components\"][\"schemas\"].items():\n", + " if schema_name not in combined_spec[\"components\"][\"schemas\"] or count_properties(schema) > count_properties(combined_spec[\"components\"][\"schemas\"][schema_name]):\n", + " combined_spec[\"components\"][\"schemas\"][schema_name] = schema\n", + "\n", + "# Save the combined API spec\n", + "output_file = 
os.path.join(folder_name, 'combined_api_spec.json')\n", + "with open(output_file, 'w') as f:\n", + " json.dump(combined_spec, f, indent=2, sort_keys=True)\n", + "\n", + "print(f\"Combined API specification saved to {output_file}\")\n", + "print(f\"Total paths in combined spec: {len(combined_spec['paths'])}\")\n", + "print(f\"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# note: turn this into a simple web app like roast my site\n", + "- select which methods you want to add\n", + "- generate a UI for each method\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 48056ea1bd731e3023fdb840d1867b5128fe5da3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:15:56 -0300 Subject: [PATCH 02/62] feat: added go html to md parser --- .../lib/__tests__/html-to-markdown.test.ts | 29 ++++ apps/api/src/lib/go-html-to-md/go.mod | 11 ++ apps/api/src/lib/go-html-to-md/go.sum | 81 +++++++++ .../src/lib/go-html-to-md/html-to-markdown.go | 41 +++++ apps/api/src/lib/html-to-markdown.ts | 162 +++++++++++------- 5 files changed, 260 insertions(+), 64 deletions(-) create mode 100644 apps/api/src/lib/__tests__/html-to-markdown.test.ts create mode 100644 apps/api/src/lib/go-html-to-md/go.mod create mode 100644 apps/api/src/lib/go-html-to-md/go.sum create mode 100644 apps/api/src/lib/go-html-to-md/html-to-markdown.go diff --git 
a/apps/api/src/lib/__tests__/html-to-markdown.test.ts b/apps/api/src/lib/__tests__/html-to-markdown.test.ts new file mode 100644 index 00000000..00db7758 --- /dev/null +++ b/apps/api/src/lib/__tests__/html-to-markdown.test.ts @@ -0,0 +1,29 @@ +import { parseMarkdown } from '../html-to-markdown'; + +describe('parseMarkdown', () => { + it('should correctly convert simple HTML to Markdown', async () => { + const html = '

<p>Hello, world!</p>

'; + const expectedMarkdown = 'Hello, world!'; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should convert complex HTML with nested elements to Markdown', async () => { + const html = '

<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>
'; + const expectedMarkdown = 'Hello **bold** world!\n\n- List item'; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should return empty string when input is empty', async () => { + const html = ''; + const expectedMarkdown = ''; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should handle null input gracefully', async () => { + const html = null; + const expectedMarkdown = ''; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + +}); diff --git a/apps/api/src/lib/go-html-to-md/go.mod b/apps/api/src/lib/go-html-to-md/go.mod new file mode 100644 index 00000000..40cce17d --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/go.mod @@ -0,0 +1,11 @@ +module html-to-markdown.go + +go 1.22.6 + +require github.com/JohannesKaufmann/html-to-markdown v1.6.0 + +require ( + github.com/PuerkitoBio/goquery v1.9.2 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + golang.org/x/net v0.25.0 // indirect +) diff --git a/apps/api/src/lib/go-html-to-md/go.sum b/apps/api/src/lib/go-html-to-md/go.sum new file mode 100644 index 00000000..59bcf2f9 --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/go.sum @@ -0,0 +1,81 @@ +github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k= +github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ= +github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= +github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty 
v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= +github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= +github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= +github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod 
h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod 
h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/apps/api/src/lib/go-html-to-md/html-to-markdown.go b/apps/api/src/lib/go-html-to-md/html-to-markdown.go new file mode 100644 index 00000000..474c8324 --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/html-to-markdown.go @@ -0,0 +1,41 @@ +package main + +import ( + "flag" + "fmt" + "log" + "sync" + + md "github.com/JohannesKaufmann/html-to-markdown" + "github.com/JohannesKaufmann/html-to-markdown/plugin" +) + +func convertHTMLToMarkdown(html string, wg *sync.WaitGroup, results chan<- string) { + defer wg.Done() + converter := md.NewConverter("", true, nil) + converter.Use(plugin.GitHubFlavored()) + + markdown, err := converter.ConvertString(html) + if err != nil { + log.Fatal(err) + } + results <- markdown +} + +func main() { + html := flag.String("html", "", "") + flag.Parse() + + var wg sync.WaitGroup + results := make(chan string, 1) + + wg.Add(1) + go convertHTMLToMarkdown(*html, &wg, results) + + wg.Wait() + close(results) + + for markdown := range results { + fmt.Println(markdown) + } +} diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 002cb7be..04fec4c6 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,74 +1,108 @@ -export async function parseMarkdown(html: string) { - var TurndownService = require("turndown"); - var turndownPluginGfm = 
require('joplin-turndown-plugin-gfm') +import { spawn } from 'node:child_process'; +import { join } from 'node:path'; +export async function parseMarkdown(html: string): Promise { + if (!html) { + return ''; + } + + if (process.env.USE_GO_MARKDOWN_PARSER == "true") { + const goScriptPath = join(__dirname, 'go-html-to-md/html-to-markdown.go'); + const goModDir = join(__dirname, 'go-html-to-md'); + const child = spawn('go', ['run', goScriptPath, '--html', html], { + cwd: goModDir, + }); + + return new Promise((resolve, reject) => { + let data = ''; + + child.stdout.on('data', (chunk) => { + data += chunk.toString(); // Convert Buffer to string + }); + + child.stderr.on('data', (chunk) => { + reject(chunk.toString()); // Convert Buffer to string before rejecting + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(data.trim()); + } else { + reject(new Error(`Process exited with code ${code}`)); + } + }); + }); + } else { + var TurndownService = require("turndown"); + var turndownPluginGfm = require('joplin-turndown-plugin-gfm') + + const turndownService = new TurndownService(); + turndownService.addRule("inlineLink", { + filter: function (node, options) { + return ( + options.linkStyle === "inlined" && + node.nodeName === "A" && + node.getAttribute("href") + ); + }, + replacement: function (content, node) { + var href = node.getAttribute("href").trim(); + var title = node.title ? 
' "' + node.title + '"' : ""; + return "[" + content.trim() + "](" + href + title + ")\n"; + }, + }); + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); + let markdownContent = ""; + const turndownPromise = new Promise((resolve, reject) => { + try { + const result = turndownService.turndown(html); + resolve(result); + } catch (error) { + reject("Error converting HTML to Markdown: " + error); + } + }); + + const timeoutPromise = new Promise((resolve, reject) => { + const timeout = 5000; // Timeout in milliseconds + setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout); + }); - const turndownService = new TurndownService(); - turndownService.addRule("inlineLink", { - filter: function (node, options) { - return ( - options.linkStyle === "inlined" && - node.nodeName === "A" && - node.getAttribute("href") - ); - }, - replacement: function (content, node) { - var href = node.getAttribute("href").trim(); - var title = node.title ? ' "' + node.title + '"' : ""; - return "[" + content.trim() + "](" + href + title + ")\n"; - }, - }); - var gfm = turndownPluginGfm.gfm; - turndownService.use(gfm); - let markdownContent = ""; - const turndownPromise = new Promise((resolve, reject) => { try { - const result = turndownService.turndown(html); - resolve(result); + markdownContent = await Promise.race([turndownPromise, timeoutPromise]); } catch (error) { - reject("Error converting HTML to Markdown: " + error); + console.error(error); + return ""; // Optionally return an empty string or handle the error as needed } - }); - const timeoutPromise = new Promise((resolve, reject) => { - const timeout = 5000; // Timeout in milliseconds - setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout); - }); + // multiple line links + let insideLinkContent = false; + let newMarkdownContent = ""; + let linkOpenCount = 0; + for (let i = 0; i < markdownContent.length; i++) { + const char = markdownContent[i]; - try { - markdownContent = 
await Promise.race([turndownPromise, timeoutPromise]); - } catch (error) { - console.error(error); - return ""; // Optionally return an empty string or handle the error as needed + if (char == "[") { + linkOpenCount++; + } else if (char == "]") { + linkOpenCount = Math.max(0, linkOpenCount - 1); + } + insideLinkContent = linkOpenCount > 0; + + if (insideLinkContent && char == "\n") { + newMarkdownContent += "\\" + "\n"; + } else { + newMarkdownContent += char; + } + } + markdownContent = newMarkdownContent; + + // Remove [Skip to Content](#page) and [Skip to content](#skip) + markdownContent = markdownContent.replace( + /\[Skip to Content\]\(#[^\)]*\)/gi, + "" + ); + return markdownContent; } - - // multiple line links - let insideLinkContent = false; - let newMarkdownContent = ""; - let linkOpenCount = 0; - for (let i = 0; i < markdownContent.length; i++) { - const char = markdownContent[i]; - - if (char == "[") { - linkOpenCount++; - } else if (char == "]") { - linkOpenCount = Math.max(0, linkOpenCount - 1); - } - insideLinkContent = linkOpenCount > 0; - - if (insideLinkContent && char == "\n") { - newMarkdownContent += "\\" + "\n"; - } else { - newMarkdownContent += char; - } - } - markdownContent = newMarkdownContent; - - // Remove [Skip to Content](#page) and [Skip to content](#skip) - markdownContent = markdownContent.replace( - /\[Skip to Content\]\(#[^\)]*\)/gi, - "" - ); - return markdownContent; } From 2a8f55e533175d75381c699c68526763dfe5892a Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 11:12:28 +0200 Subject: [PATCH 03/62] perf(js-sdk): remove whole `z` import and instead use type-only import --- apps/js-sdk/firecrawl/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1bb4ee..95b4eebd 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,5 +1,5 @@ import axios, { AxiosResponse, 
AxiosRequestHeaders } from "axios"; -import { z } from "zod"; +import type { ZodSchema } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; @@ -81,7 +81,7 @@ export interface ScrapeParams { onlyMainContent?: boolean; extract?: { prompt?: string; - schema?: z.ZodSchema | any; + schema?: ZodSchema | any; systemPrompt?: string; }; waitFor?: number; From 2b0e447bc26ec94f930af68de4d0ad4e6d6fb08f Mon Sep 17 00:00:00 2001 From: Andrei Bobkov Date: Tue, 3 Sep 2024 11:13:48 +0200 Subject: [PATCH 04/62] perf(js-sdk): move `dotenv` and `uuid` to `devDependencies` --- apps/js-sdk/firecrawl/package-lock.json | 12 +++++++----- apps/js-sdk/firecrawl/package.json | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index ce6a1a4a..7c2ecbfd 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,19 +1,17 @@ { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.1.0", + "version": "1.2.1", "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -25,9 +23,11 @@ "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", + "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", - "typescript": "^5.4.5" + "typescript": "^5.4.5", + "uuid": "^9.0.1" } }, "node_modules/@ampproject/remapping": { @@ -1657,6 +1657,7 @@ "version": "16.4.5", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "dev": 
true, "engines": { "node": ">=12" }, @@ -3794,6 +3795,7 @@ "version": "9.0.1", "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e68b3014..62120b35 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -29,10 +29,8 @@ "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -41,6 +39,8 @@ }, "homepage": "https://github.com/mendableai/firecrawl#readme", "devDependencies": { + "uuid": "^9.0.1", + "dotenv": "^16.4.5", "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", From 291d9e375b27e442e5928e6e5a62e1a96e35d674 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:56:07 -0300 Subject: [PATCH 05/62] now using compiled go/C lib with koffi --- apps/api/Dockerfile | 12 +- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 8 + apps/api/src/lib/go-html-to-md/README.md | 7 + apps/api/src/lib/go-html-to-md/go.mod | 5 +- apps/api/src/lib/go-html-to-md/go.sum | 12 ++ .../src/lib/go-html-to-md/html-to-markdown.go | 28 +-- apps/api/src/lib/html-to-markdown.ts | 178 +++++++++--------- apps/test-suite/tests/scrape.test.ts | 11 +- 9 files changed, 144 insertions(+), 118 deletions(-) create mode 100644 apps/api/src/lib/go-html-to-md/README.md diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index 3ffede0d..a4a2c76b 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -17,8 +17,15 @@ RUN pnpm install RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \ bash -c 'export 
SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi' -# Install packages needed for deployment +# Install Go 1.19 +FROM golang:1.19 AS go-base +COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md +# Install Go dependencies and build +RUN cd /app/src/lib/go-html-to-md && \ + go mod tidy && \ + go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \ + chmod +x html-to-markdown.so FROM base RUN apt-get update -qq && \ @@ -26,8 +33,7 @@ RUN apt-get update -qq && \ rm -rf /var/lib/apt/lists /var/cache/apt/archives COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=build /app /app - - +COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/src/lib/go-html-to-md/html-to-markdown.so # Start the server by default, this can be overwritten at runtime diff --git a/apps/api/package.json b/apps/api/package.json index bac13e79..dc26b34b 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -86,6 +86,7 @@ "joplin-turndown-plugin-gfm": "^1.0.12", "json-schema-to-zod": "^2.3.0", "keyword-extractor": "^0.0.28", + "koffi": "^2.9.0", "langchain": "^0.2.8", "languagedetect": "^2.0.0", "logsnag": "^1.0.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 2762a84c..b8f876a8 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -122,6 +122,9 @@ importers: keyword-extractor: specifier: ^0.0.28 version: 0.0.28 + koffi: + specifier: ^2.9.0 + version: 2.9.0 langchain: specifier: ^0.2.8 version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) @@ -3170,6 +3173,9 @@ packages: resolution: {integrity: 
sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} engines: {node: '>=6'} + koffi@2.9.0: + resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==} + langchain@0.2.8: resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==} engines: {node: '>=18'} @@ -8492,6 +8498,8 @@ snapshots: kleur@3.0.3: {} + koffi@2.9.0: {} + langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) diff --git a/apps/api/src/lib/go-html-to-md/README.md b/apps/api/src/lib/go-html-to-md/README.md new file mode 100644 index 00000000..4ad510c3 --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/README.md @@ -0,0 +1,7 @@ +To build the go-html-to-md library, run the following command: + +```bash +cd apps/api/src/lib/go-html-to-md +go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go +chmod +x html-to-markdown.so +``` \ No newline at end of file diff --git a/apps/api/src/lib/go-html-to-md/go.mod b/apps/api/src/lib/go-html-to-md/go.mod index 40cce17d..0836f441 100644 --- a/apps/api/src/lib/go-html-to-md/go.mod +++ b/apps/api/src/lib/go-html-to-md/go.mod @@ -1,11 +1,14 @@ module html-to-markdown.go -go 1.22.6 +go 1.19 require github.com/JohannesKaufmann/html-to-markdown v1.6.0 require ( github.com/PuerkitoBio/goquery v1.9.2 // indirect 
github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/kr/pretty v0.3.0 // indirect golang.org/x/net v0.25.0 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect ) diff --git a/apps/api/src/lib/go-html-to-md/go.sum b/apps/api/src/lib/go-html-to-md/go.sum index 59bcf2f9..7961629d 100644 --- a/apps/api/src/lib/go-html-to-md/go.sum +++ b/apps/api/src/lib/go-html-to-md/go.sum @@ -4,14 +4,22 @@ github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4 github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
+github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= @@ -75,7 +83,11 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/apps/api/src/lib/go-html-to-md/html-to-markdown.go b/apps/api/src/lib/go-html-to-md/html-to-markdown.go index 474c8324..9905a69a 100644 --- a/apps/api/src/lib/go-html-to-md/html-to-markdown.go +++ b/apps/api/src/lib/go-html-to-md/html-to-markdown.go @@ -1,41 +1,25 @@ package main import ( - "flag" - "fmt" + "C" "log" - "sync" md "github.com/JohannesKaufmann/html-to-markdown" 
"github.com/JohannesKaufmann/html-to-markdown/plugin" ) -func convertHTMLToMarkdown(html string, wg *sync.WaitGroup, results chan<- string) { - defer wg.Done() +//export ConvertHTMLToMarkdown +func ConvertHTMLToMarkdown(html *C.char) *C.char { converter := md.NewConverter("", true, nil) converter.Use(plugin.GitHubFlavored()) - markdown, err := converter.ConvertString(html) + markdown, err := converter.ConvertString(C.GoString(html)) if err != nil { log.Fatal(err) } - results <- markdown + return C.CString(markdown) } func main() { - html := flag.String("html", "", "") - flag.Parse() - - var wg sync.WaitGroup - results := make(chan string, 1) - - wg.Add(1) - go convertHTMLToMarkdown(*html, &wg, results) - - wg.Wait() - close(results) - - for markdown := range results { - fmt.Println(markdown) - } + // This function is required for the main package } diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 04fec4c6..4c7cffdd 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,108 +1,106 @@ -import { spawn } from 'node:child_process'; -import { join } from 'node:path'; +import koffi from 'koffi'; +import { join } from 'path'; +import "../services/sentry" +import * as Sentry from "@sentry/node"; + +import dotenv from 'dotenv'; +import { Logger } from './logger'; +dotenv.config(); export async function parseMarkdown(html: string): Promise { if (!html) { return ''; } - if (process.env.USE_GO_MARKDOWN_PARSER == "true") { - const goScriptPath = join(__dirname, 'go-html-to-md/html-to-markdown.go'); - const goModDir = join(__dirname, 'go-html-to-md'); - const child = spawn('go', ['run', goScriptPath, '--html', html], { - cwd: goModDir, - }); + try { + if (process.env.USE_GO_MARKDOWN_PARSER == "true") { + const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so'); + const lib = koffi.load(goExecutablePath); + + const convert = lib.func('Convert', 'string', ['string']); - 
return new Promise((resolve, reject) => { - let data = ''; - - child.stdout.on('data', (chunk) => { - data += chunk.toString(); // Convert Buffer to string + let markdownContent = await new Promise((resolve, reject) => { + convert.async(html, (err: Error, res: string) => { + if (err) { + reject(err); + } else { + resolve(res); + } + }); }); - child.stderr.on('data', (chunk) => { - reject(chunk.toString()); // Convert Buffer to string before rejecting - }); - - child.on('close', (code) => { - if (code === 0) { - resolve(data.trim()); - } else { - reject(new Error(`Process exited with code ${code}`)); - } - }); - }); - } else { - var TurndownService = require("turndown"); - var turndownPluginGfm = require('joplin-turndown-plugin-gfm') - - const turndownService = new TurndownService(); - turndownService.addRule("inlineLink", { - filter: function (node, options) { - return ( - options.linkStyle === "inlined" && - node.nodeName === "A" && - node.getAttribute("href") - ); - }, - replacement: function (content, node) { - var href = node.getAttribute("href").trim(); - var title = node.title ? 
' "' + node.title + '"' : ""; - return "[" + content.trim() + "](" + href + title + ")\n"; - }, - }); - var gfm = turndownPluginGfm.gfm; - turndownService.use(gfm); - let markdownContent = ""; - const turndownPromise = new Promise((resolve, reject) => { - try { - const result = turndownService.turndown(html); - resolve(result); - } catch (error) { - reject("Error converting HTML to Markdown: " + error); - } - }); - - const timeoutPromise = new Promise((resolve, reject) => { - const timeout = 5000; // Timeout in milliseconds - setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout); - }); - - try { - markdownContent = await Promise.race([turndownPromise, timeoutPromise]); - } catch (error) { - console.error(error); - return ""; // Optionally return an empty string or handle the error as needed + markdownContent = processMultiLineLinks(markdownContent); + markdownContent = removeSkipToContentLinks(markdownContent); + return markdownContent; } + } catch (error) { + Sentry.captureException(error); + Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); + } - // multiple line links - let insideLinkContent = false; - let newMarkdownContent = ""; - let linkOpenCount = 0; - for (let i = 0; i < markdownContent.length; i++) { - const char = markdownContent[i]; + // Fallback to TurndownService if Go parser fails or is not enabled + var TurndownService = require("turndown"); + var turndownPluginGfm = require('joplin-turndown-plugin-gfm'); - if (char == "[") { - linkOpenCount++; - } else if (char == "]") { - linkOpenCount = Math.max(0, linkOpenCount - 1); - } - insideLinkContent = linkOpenCount > 0; + const turndownService = new TurndownService(); + turndownService.addRule("inlineLink", { + filter: function (node, options) { + return ( + options.linkStyle === "inlined" && + node.nodeName === "A" && + node.getAttribute("href") + ); + }, + replacement: function (content, node) { + var href = node.getAttribute("href").trim(); + var 
title = node.title ? ' "' + node.title + '"' : ""; + return "[" + content.trim() + "](" + href + title + ")\n"; + }, + }); + var gfm = turndownPluginGfm.gfm; + turndownService.use(gfm); - if (insideLinkContent && char == "\n") { - newMarkdownContent += "\\" + "\n"; - } else { - newMarkdownContent += char; - } - } - markdownContent = newMarkdownContent; + try { + let markdownContent = await turndownService.turndown(html); + markdownContent = processMultiLineLinks(markdownContent); + markdownContent = removeSkipToContentLinks(markdownContent); - // Remove [Skip to Content](#page) and [Skip to content](#skip) - markdownContent = markdownContent.replace( - /\[Skip to Content\]\(#[^\)]*\)/gi, - "" - ); return markdownContent; + } catch (error) { + console.error("Error converting HTML to Markdown: ", error); + return ""; // Optionally return an empty string or handle the error as needed } } + +function processMultiLineLinks(markdownContent: string): string { + let insideLinkContent = false; + let newMarkdownContent = ""; + let linkOpenCount = 0; + for (let i = 0; i < markdownContent.length; i++) { + const char = markdownContent[i]; + + if (char == "[") { + linkOpenCount++; + } else if (char == "]") { + linkOpenCount = Math.max(0, linkOpenCount - 1); + } + insideLinkContent = linkOpenCount > 0; + + if (insideLinkContent && char == "\n") { + newMarkdownContent += "\\" + "\n"; + } else { + newMarkdownContent += char; + } + } + return newMarkdownContent; +} + +function removeSkipToContentLinks(markdownContent: string): string { + // Remove [Skip to Content](#page) and [Skip to content](#skip) + const newMarkdownContent = markdownContent.replace( + /\[Skip to Content\]\(#[^\)]*\)/gi, + "" + ); + return newMarkdownContent; +} \ No newline at end of file diff --git a/apps/test-suite/tests/scrape.test.ts b/apps/test-suite/tests/scrape.test.ts index ec7b7202..8b2e15d1 100644 --- a/apps/test-suite/tests/scrape.test.ts +++ b/apps/test-suite/tests/scrape.test.ts @@ -31,6 +31,7 @@ 
describe("Scraping Checkup (E2E)", () => { describe("Scraping website tests with a dataset", () => { it("Should scrape the website and prompt it against OpenAI", async () => { + let totalTimeTaken = 0; let passedTests = 0; const batchSize = 15; // Adjusted to comply with the rate limit of 15 per minute const batchPromises = []; @@ -51,11 +52,16 @@ describe("Scraping Checkup (E2E)", () => { const batchPromise = Promise.all( batch.map(async (websiteData: WebsiteData) => { try { + const startTime = new Date().getTime(); const scrapedContent = await request(TEST_URL || "") - .post("/v0/scrape") + .post("/v1/scrape") .set("Content-Type", "application/json") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .send({ url: websiteData.website, pageOptions: { onlyMainContent: true } }); + .send({ url: websiteData.website }); + + const endTime = new Date().getTime(); + const timeTaken = endTime - startTime; + totalTimeTaken += timeTaken; if (scrapedContent.statusCode !== 200) { console.error(`Failed to scrape ${websiteData.website} ${scrapedContent.statusCode}`); @@ -165,6 +171,7 @@ describe("Scraping Checkup (E2E)", () => { const timeTaken = (endTime - startTime) / 1000; console.log(`Score: ${score}%`); console.log(`Total tokens: ${totalTokens}`); + console.log(`Total time taken: ${totalTimeTaken} miliseconds`); await logErrors(errorLog, timeTaken, totalTokens, score, websitesData.length); From 411d7f31c5347b1524632b6743e3b37508817b9a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:56:24 -0300 Subject: [PATCH 06/62] fix(sdks): fetch next/pagination --- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 4b3807be..59c5fe35 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ 
b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.2.1" +__version__ = "1.2.2" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 75245e8d..d65b0341 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -346,6 +346,12 @@ class FirecrawlApp: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + status_response = self._get_request(status_data['next'], headers) + status_data = status_response.json() + data.extend(status_data['data']) + status_data['data'] = data return status_data else: raise Exception('Crawl job completed but no data was returned') From 6ccc22ba2fd77e5fe993c2dd64b8c82900bb352a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:57:19 -0300 Subject: [PATCH 07/62] fix(sdk): js next pagination --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e68b3014..7114a625 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.1", + "version": "1.2.2", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1bb4ee..8b16adfb 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -454,20 +454,27 @@ export default class FirecrawlApp { checkInterval: 
number ): Promise { while (true) { - const statusResponse: AxiosResponse = await this.getRequest( + let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, headers ); if (statusResponse.status === 200) { - const statusData = statusResponse.data; + let statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; return statusData; } else { throw new Error("Crawl job completed but no data was returned"); } } else if ( - ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status) + ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { checkInterval = Math.max(checkInterval, 2); await new Promise((resolve) => From ebf403548487249bdb2d8ca2bfe086f950736622 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 13:15:21 -0300 Subject: [PATCH 08/62] added log so we can check --- apps/api/src/lib/html-to-markdown.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 4c7cffdd..e4f8f692 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -32,6 +32,7 @@ export async function parseMarkdown(html: string): Promise { markdownContent = processMultiLineLinks(markdownContent); markdownContent = removeSkipToContentLinks(markdownContent); + Logger.info(`HTML to Markdown conversion using Go parser successful`); return markdownContent; } } catch (error) { From d60fa6e0849fe13538cffaf36bafec55bc1c6ff6 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 
14:08:07 -0300 Subject: [PATCH 09/62] fixed dockerfile and function name. it's working --- apps/api/Dockerfile | 9 ++++----- apps/api/src/lib/html-to-markdown.ts | 6 +++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index a4a2c76b..527a6dc7 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -17,11 +17,11 @@ RUN pnpm install RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \ bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi' -# Install Go 1.19 +# Install Go FROM golang:1.19 AS go-base COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md -# Install Go dependencies and build +# Install Go dependencies and build parser lib RUN cd /app/src/lib/go-html-to-md && \ go mod tidy && \ go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \ @@ -33,9 +33,8 @@ RUN apt-get update -qq && \ rm -rf /var/lib/apt/lists /var/cache/apt/archives COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=build /app /app -COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/src/lib/go-html-to-md/html-to-markdown.so - +COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so # Start the server by default, this can be overwritten at runtime EXPOSE 8080 -ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" +ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" \ No newline at end of file diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index e4f8f692..a5f69962 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -8,6 +8,10 @@ import dotenv from 'dotenv'; import { Logger } from './logger'; dotenv.config(); +// TODO: test with invalid html +// TODO: create a singleton for the converter +// TODO: add a timeout to the Go parser + export 
async function parseMarkdown(html: string): Promise { if (!html) { return ''; @@ -18,7 +22,7 @@ export async function parseMarkdown(html: string): Promise { const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so'); const lib = koffi.load(goExecutablePath); - const convert = lib.func('Convert', 'string', ['string']); + const convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); let markdownContent = await new Promise((resolve, reject) => { convert.async(html, (err: Error, res: string) => { From c5e1d77a8253e471a7b021229ac6c85743ba8343 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:21:45 -0300 Subject: [PATCH 10/62] added invalid html tests --- apps/api/src/lib/__tests__/html-to-markdown.test.ts | 13 ++++++++++++- apps/api/src/lib/html-to-markdown.ts | 1 - 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/apps/api/src/lib/__tests__/html-to-markdown.test.ts b/apps/api/src/lib/__tests__/html-to-markdown.test.ts index 00db7758..3c68c959 100644 --- a/apps/api/src/lib/__tests__/html-to-markdown.test.ts +++ b/apps/api/src/lib/__tests__/html-to-markdown.test.ts @@ -25,5 +25,16 @@ describe('parseMarkdown', () => { await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); }); - + it('should handle various types of invalid HTML gracefully', async () => { + const invalidHtmls = [ + { html: '

Unclosed tag', expected: 'Unclosed tag' }, + { html: '

Missing closing div', expected: 'Missing closing div' }, + { html: '

Wrong nesting

', expected: '**Wrong nesting**' }, + { html: 'Link without closing tag', expected: '[Link without closing tag](http://example.com)' } + ]; + + for (const { html, expected } of invalidHtmls) { + await expect(parseMarkdown(html)).resolves.toBe(expected); + } + }); }); diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index a5f69962..103948f4 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -8,7 +8,6 @@ import dotenv from 'dotenv'; import { Logger } from './logger'; dotenv.config(); -// TODO: test with invalid html // TODO: create a singleton for the converter // TODO: add a timeout to the Go parser From 7561fd279f7d48ea0572b14175d61008a615da5a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 3 Sep 2024 17:08:12 -0300 Subject: [PATCH 11/62] Nick: debug the billing email system for free credits --- apps/api/src/services/billing/credit_billing.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 22dc72df..d828a54e 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -255,7 +255,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const creditLimit = FREE_CREDITS; const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit; - if (creditUsagePercentage >= 0.8) { + // Add a check to ensure totalCreditsUsed is greater than 0 + if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { + Logger.info(`Sending notification for team ${team_id}. 
Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`); await sendNotification( team_id, NotificationType.APPROACHING_LIMIT, From 3072d4a33305c888b6650bd895526319c942f40e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 3 Sep 2024 21:02:41 -0300 Subject: [PATCH 12/62] Nick: fixed .sort coupons and sentry to withAuth --- apps/api/src/lib/withAuth.ts | 2 ++ apps/api/src/services/billing/credit_billing.ts | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 1979907e..90cfb449 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,5 +1,6 @@ import { AuthResponse } from "../../src/types"; import { Logger } from "./logger"; +import * as Sentry from "@sentry/node"; let warningCount = 0; @@ -18,6 +19,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { + Sentry.captureException(error); Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index d828a54e..2cfea85a 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) { ]); let couponCredits = 0; + let sortedCoupons = []; + if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( (total, coupon) => total + coupon.credits, 0 ); + sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits); } - - let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits); // using coupon credits: if (couponCredits > 0) { // if there is no subscription and they have enough coupon credits From 049a11187d0a0143721c8ba745889e928ced4b38 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 3 Sep 2024 21:09:32 -0300 Subject: 
[PATCH 13/62] Nick: --- apps/api/src/controllers/v0/scrape.ts | 12 ++++-------- apps/api/src/controllers/v0/search.ts | 16 ++++------------ apps/api/src/controllers/v1/map.ts | 6 +++++- apps/api/src/controllers/v1/scrape.ts | 12 ++++-------- apps/api/src/main/runWebScraper.ts | 13 ++++--------- apps/api/src/services/billing/credit_billing.ts | 9 ++++----- 6 files changed, 25 insertions(+), 43 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 40df5021..bc91da18 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - const billingResult = await billTeam(team_id, creditsToBeBilled); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - }); - } + billTeam(team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 825abbe1..5ef2b767 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -54,18 +54,10 @@ export async function searchHelper( if (justSearch) { - const billingResult = await billTeam( - team_id, - res.length - ); - if (!billingResult.success) { - return { - success: false, - error: - "Failed to bill team. 
Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + billTeam(team_id, res.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); return { success: true, data: res, returnCode: 200 }; } diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 21e91840..4c94f041 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; +import { Logger } from "../../lib/logger"; configDotenv(); @@ -100,7 +101,10 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = removeDuplicateUrls(links); - await billTeam(req.auth.team_id, 1); + billTeam(req.auth.team_id, 1).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 9fba1a45..0835cc2a 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -106,14 +106,10 @@ export async function scrapeController( creditsToBeBilled = 50; } - const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: - "Failed to bill team. 
Insufficient credits or subscription not found.", - }); - } + billTeam(req.auth.team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); if (!pageOptions || !pageOptions.includeRawHtml) { if (doc && doc.rawHtml) { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 2268f9ed..cd199fa1 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -118,15 +118,10 @@ export async function runWebScraper({ : docs; if(is_scrape === false) { - const billingResult = await billTeam(team_id, filteredDocs.length); - if (!billingResult.success) { - // throw new Error("Failed to bill team, no subscription was found"); - return { - success: false, - message: "Failed to bill team, no subscription was found", - docs: [], - }; - } + billTeam(team_id, filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 2cfea85a..ab00eab9 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -465,8 +465,8 @@ async function createCreditUsage({ subscription_id?: string; credits: number; }) { - const { data: credit_usage } = await supabase_service - .from("credit_usage") + await supabase_service + .from("credit_usage") .insert([ { team_id, @@ -474,8 +474,7 @@ async function createCreditUsage({ subscription_id: subscription_id || null, created_at: new Date(), }, - ]) - .select(); + ]); - return { success: true, credit_usage }; + return { success: true }; } From 653b76fe3dc13f7271260386a09120ac4823ae71 Mon Sep 17 00:00:00 2001 From: 
Nicolas Date: Tue, 3 Sep 2024 23:33:29 -0300 Subject: [PATCH 14/62] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 89ed0127..3803716f 100644 --- a/README.md +++ b/README.md @@ -391,7 +391,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup from firecrawl.firecrawl import FirecrawlApp -app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0") +app = FirecrawlApp(api_key="fc-YOUR_API_KEY") class ArticleSchema(BaseModel): title: str @@ -466,8 +466,7 @@ import FirecrawlApp from "@mendable/firecrawl-js"; import { z } from "zod"; const app = new FirecrawlApp({ - apiKey: "fc-YOUR_API_KEY", - version: "v0" + apiKey: "fc-YOUR_API_KEY" }); // Define schema to extract contents into From 28df35382949f40e6eba41977c49f221aa0876e7 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:16:31 -0300 Subject: [PATCH 15/62] fix(cicd): wait and moved rust publish --- .github/workflows/fly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index ba4a099e..0fd1c12f 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -57,6 +57,9 @@ jobs: run: npm run workers & working-directory: ./apps/api id: start_workers + - name: Wait for the application to be ready + run: | + sleep 10 - name: Run E2E tests run: | npm run test:prod @@ -338,6 +341,7 @@ jobs: build-and-publish-rust-sdk: name: Build and publish Rust SDK runs-on: ubuntu-latest + needs: deploy steps: - name: Checkout repository From d836ba67821235fb3363065d01fb36f0094044f9 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:35:56 -0300 Subject: [PATCH 16/62] added log to check response on cicd --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index dd7d4f16..40017d2b 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -32,6 +32,10 @@ describe("E2E Tests for v1 API Routes", () => { const response: ScrapeResponseRequestTest = await request(TEST_URL).post( "/v1/scrape" ); + console.log({ + response: response.body, + statusCode: response.statusCode, + }) expect(response.statusCode).toBe(401); }); From 74ac8915cd912b37e779f44bd428e550abda5b6d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:41:38 -0300 Subject: [PATCH 17/62] details --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 40017d2b..9d504f1f 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -34,6 +34,7 @@ describe("E2E Tests for v1 API Routes", () => { ); console.log({ response: response.body, + details: response.body.success == false ? response.body.details : null, statusCode: response.statusCode, }) expect(response.statusCode).toBe(401); From 57aa6d18525a34607730b926f3e014f1d9bbac1b Mon Sep 17 00:00:00 2001 From: Eric Ciarla <43451761+ericciarla@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:56:41 -0400 Subject: [PATCH 18/62] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3803716f..63dd6ea5 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,9 @@ GitHub Contributors - - Open Source + + Visit firecrawl.dev -

From ad950a6c9d641aaeb0550b702a64825c7c11838e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:29:32 -0300 Subject: [PATCH 19/62] fixed controller res and tests --- .../__tests__/e2e_v1_withAuth/index.test.ts | 26 ++++++++----------- apps/api/src/routes/v1.ts | 24 +++++++++++------ 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 9d504f1f..2d27462f 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -29,14 +29,10 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/scrape", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/scrape" - ); - console.log({ - response: response.body, - details: response.body.success == false ? 
response.body.details : null, - statusCode: response.statusCode, - }) + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .send({ url: "https://firecrawl.dev"}) + expect(response.statusCode).toBe(401); }); @@ -456,9 +452,9 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/map", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/map" - ); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); @@ -614,9 +610,9 @@ describe("POST /v1/map", () => { describe("POST /v1/crawl", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/crawl" - ); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); @@ -868,7 +864,7 @@ describe("GET /v1/crawl/:jobId", () => { .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://docs.mendable.ai" }); + .send({ url: "https://docs.firecrawl.dev" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 9dcbf111..daa9bf43 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); if (!success) { Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); - return res.status(402).json({ success: false, error: 
"Insufficient credits" }); + if (!res.headersSent) { + return res.status(402).json({ success: false, error: "Insufficient credits" }); + } } req.account = { remainingCredits } next(); @@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW ); if (!success) { - return res.status(status).json({ success: false, error }); + if (!res.headersSent) { + return res.status(status).json({ success: false, error }); + } } req.auth = { team_id, plan }; @@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) if (req.headers["x-idempotency-key"]) { const isIdempotencyValid = await validateIdempotencyKey(req); if (!isIdempotencyValid) { - return res.status(409).json({ success: false, error: "Idempotency key already used" }); + if (!res.headersSent) { + return res.status(409).json({ success: false, error: "Idempotency key already used" }); + } } createIdempotencyKey(req); } @@ -78,7 +84,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { if (req.body.url && isUrlBlocked(req.body.url)) { - return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); + if (!res.headersSent) { + return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
}); + } } next(); } @@ -96,26 +104,26 @@ export const v1Router = express.Router(); v1Router.post( "/scrape", - blocklistMiddleware, authMiddleware(RateLimiterMode.Scrape), checkCreditsMiddleware(1), + blocklistMiddleware, wrap(scrapeController) ); v1Router.post( "/crawl", - blocklistMiddleware, authMiddleware(RateLimiterMode.Crawl), - idempotencyMiddleware, checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, wrap(crawlController) ); v1Router.post( "/map", - blocklistMiddleware, authMiddleware(RateLimiterMode.Map), checkCreditsMiddleware(1), + blocklistMiddleware, wrap(mapController) ); From f98a8541c2e5de16733286482487b84922140db1 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:41:50 -0300 Subject: [PATCH 20/62] fix(cicd): added use_db_auth to deploy workflow --- .github/workflows/fly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 0fd1c12f..9209309f 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -28,6 +28,7 @@ env: NPM_TOKEN: ${{ secrets.NPM_TOKEN }} CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} jobs: pre-deploy-e2e-tests: From a2a63e42cc69ef314ca89aca9eb9ea121d77af9c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 12:03:58 -0300 Subject: [PATCH 21/62] Rm print map --- apps/python-sdk/firecrawl/firecrawl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d65b0341..254f4c70 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -238,7 +238,6 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - print(response) if response['success'] and 'links' in response: return response['links'] else: 
From 364ba9f90172393bb3c88086cc6eed9edfceab83 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:05:02 -0300 Subject: [PATCH 22/62] fix(cicd): mendable->firecrawl and waitfor --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 10 +++++----- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 2d27462f..e51b349d 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -390,7 +390,7 @@ describe("E2E Tests for v1 API Routes", () => { const scrapeRequest: ScrapeRequest = { url: "https://ycombinator.com/companies", formats: ["markdown"], - waitFor: 5000 + waitFor: 8000 }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -560,7 +560,9 @@ describe("POST /v1/map", () => { const links = response.body.links as unknown[]; expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("docs.firecrawl.dev"); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); }, 10000) it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { @@ -894,9 +896,7 @@ describe("GET /v1/crawl/:jobId", () => { expect(completedResponse.body.data[0]).not.toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe( - 200 - ); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); expect( completedResponse.body.data[0].metadata.error ).toBeUndefined(); 
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 330f8130..26caf63e 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); + .send({ url: "https://firecrawl.dev/blog" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 - ); + expect(completedResponse.body.data[0].content).toContain("Firecrawl"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect( completedResponse.body.data[0].metadata.pageError ).toBeUndefined(); @@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => { (doc) => doc.metadata && doc.metadata.sourceURL && - doc.metadata.sourceURL.includes("mendable.ai/blog") + doc.metadata.sourceURL.includes("firecrawl.dev/blog") ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); From 1eb993a93b3e00ab6685e5326c2fb41797cfd8ff Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 12:09:28 -0300 Subject: [PATCH 23/62] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 59c5fe35..f178cd61 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ 
b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.2.2" +__version__ = "1.2.3" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 3f462eabe9c976c88b5c27c3b9743647cc8f3244 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:27:46 -0300 Subject: [PATCH 24/62] fix(cicd): --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index e51b349d..880d34a1 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -535,7 +535,9 @@ describe("POST /v1/map", () => { const links = response.body.links as unknown[]; expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("docs.firecrawl.dev"); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); }); it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { From 5ecb2436932f00529a0f4116a388f1d874dff8d7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 15:19:45 -0300 Subject: [PATCH 25/62] Nick: --- .../src/services/billing/credit_billing.ts | 143 +++++++++++++----- apps/api/src/services/queue-jobs.ts | 2 +- 2 files changed, 103 insertions(+), 42 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index ab00eab9..9ea0435e 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -5,7 +5,7 @@ import { supabase_service } from 
"../supabase"; import { Logger } from "../../lib/logger"; import { getValue, setValue } from "../redis"; import { redlock } from "../redlock"; - +import * as Sentry from "@sentry/node"; const FREE_CREDITS = 500; @@ -176,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; } - // Retrieve the team's active subscription and check for available coupons concurrently - const [{ data: subscription, error: subscriptionError }, { data: coupons }] = - await Promise.all([ + + let cacheKeySubscription = `subscription_${team_id}`; + let cacheKeyCoupons = `coupons_${team_id}`; + + // Try to get data from cache first + const [cachedSubscription, cachedCoupons] = await Promise.all([ + getValue(cacheKeySubscription), + getValue(cacheKeyCoupons) + ]); + + let subscription, subscriptionError, coupons; + + if (cachedSubscription && cachedCoupons) { + subscription = JSON.parse(cachedSubscription); + coupons = JSON.parse(cachedCoupons); + } else { + // If not in cache, retrieve from database + const [subscriptionResult, couponsResult] = await Promise.all([ supabase_service .from("subscriptions") .select("id, price_id, current_period_start, current_period_end") @@ -192,6 +207,18 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { .eq("status", "active"), ]); + subscription = subscriptionResult.data; + subscriptionError = subscriptionResult.error; + coupons = couponsResult.data; + + // Cache the results for a minute, sub can be null and that's fine + await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null + + if (coupons) { + await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute + } + } + let couponCredits = 0; if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( @@ -212,41 +239,54 @@ export async function 
supaCheckTeamCredits(team_id: string, credits: number) { let creditUsages; let creditUsageError; - let retries = 0; - const maxRetries = 3; - const retryInterval = 2000; // 2 seconds + let totalCreditsUsed = 0; + const cacheKeyCreditUsage = `credit_usage_${team_id}`; - while (retries < maxRetries) { - const result = await supabase_service - .from("credit_usage") - .select("credits_used") - .is("subscription_id", null) - .eq("team_id", team_id); + // Try to get credit usage from cache + const cachedCreditUsage = await getValue(cacheKeyCreditUsage); - creditUsages = result.data; - creditUsageError = result.error; + if (cachedCreditUsage) { + totalCreditsUsed = parseInt(cachedCreditUsage); + } else { + let retries = 0; + const maxRetries = 3; + const retryInterval = 2000; // 2 seconds - if (!creditUsageError) { - break; + while (retries < maxRetries) { + const result = await supabase_service + .from("credit_usage") + .select("credits_used") + .is("subscription_id", null) + .eq("team_id", team_id); + + creditUsages = result.data; + creditUsageError = result.error; + + if (!creditUsageError) { + break; + } + + retries++; + if (retries < maxRetries) { + await new Promise(resolve => setTimeout(resolve, retryInterval)); + } } - retries++; - if (retries < maxRetries) { - await new Promise(resolve => setTimeout(resolve, retryInterval)); + if (creditUsageError) { + Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`); + throw new Error( + `Failed to retrieve credit usage for team_id: ${team_id}` + ); } - } - if (creditUsageError) { - Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`); - throw new Error( - `Failed to retrieve credit usage for team_id: ${team_id}` + totalCreditsUsed = creditUsages.reduce( + (acc, usage) => acc + usage.credits_used, + 0 ); - } - const totalCreditsUsed = creditUsages.reduce( - (acc, usage) => acc + usage.credits_used, - 0 - ); + // Cache the result for 30 seconds + await 
setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30); + } Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); @@ -312,7 +352,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { if (creditUsages && creditUsages.length > 0) { totalCreditsUsed = creditUsages[0].total_credits_used; - await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes + await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes // Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`); } } @@ -325,17 +365,38 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { // Adjust total credits used by subtracting coupon value const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits); - // Get the price details - const { data: price, error: priceError } = await supabase_service - .from("prices") - .select("credits") - .eq("id", subscription.price_id) - .single(); - if (priceError) { - throw new Error( - `Failed to retrieve price for price_id: ${subscription.price_id}` - ); + // Get the price details from cache or database + const priceCacheKey = `price_${subscription.price_id}`; + let price; + + try { + const cachedPrice = await getValue(priceCacheKey); + if (cachedPrice) { + price = JSON.parse(cachedPrice); + } else { + const { data, error: priceError } = await supabase_service + .from("prices") + .select("credits") + .eq("id", subscription.price_id) + .single(); + + if (priceError) { + throw new Error( + `Failed to retrieve price for price_id: ${subscription.price_id}` + ); + } + + price = data; + // There are only 21 records, so this is super fine + // Cache the price for a long time (e.g., 1 day) + await setValue(priceCacheKey, JSON.stringify(price), 86400); + } + } catch (error) { + Logger.error(`Error retrieving or caching price: ${error}`); + Sentry.captureException(error); + // If errors, just assume it's a big number so user don't get an error + price = 
{ credits: 1000000 }; } const creditLimit = price.credits; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 941b571d..7a698772 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) { reject((await getScrapeQueue().getJob(jobId)).failedReason); } } - }, 1000); + }, 500); }) } From cb8571abad6d0388daf9b66e7db76a22116df6df Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 15:57:57 -0300 Subject: [PATCH 26/62] fix: enforced dotenv config --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 4 ++-- apps/api/src/controllers/v0/crawl-cancel.ts | 2 ++ apps/api/src/controllers/v0/crawl-status.ts | 2 ++ apps/api/src/controllers/v1/crawl-cancel.ts | 2 ++ apps/api/src/controllers/v1/crawl-status.ts | 2 ++ apps/api/src/lib/logger.ts | 3 +++ apps/api/src/lib/scrape-events.ts | 2 ++ apps/api/src/lib/withAuth.ts | 2 ++ apps/api/src/main/runWebScraper.ts | 2 ++ apps/api/src/services/logging/crawl_log.ts | 3 ++- apps/api/src/services/logging/log_job.ts | 2 ++ apps/api/src/services/logging/scrape_log.ts | 2 ++ apps/api/src/services/queue-worker.ts | 2 ++ apps/api/src/services/supabase.ts | 2 ++ apps/api/src/services/webhook.ts | 2 ++ apps/test-suite/utils/supabase.ts | 3 ++- 16 files changed, 33 insertions(+), 4 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 880d34a1..913f9408 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -1,11 +1,11 @@ import request from "supertest"; -import dotenv from "dotenv"; +import { configDotenv } from "dotenv"; import { ScrapeRequest, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; -dotenv.config(); +configDotenv(); const TEST_URL = 
"http://127.0.0.1:3002"; describe("E2E Tests for v1 API Routes", () => { diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts index bf1c2d0a..efcd454a 100644 --- a/apps/api/src/controllers/v0/crawl-cancel.ts +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase"; import { Logger } from "../../../src/lib/logger"; import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index b0649cd0..a3f3f16f 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function getJobs(ids: string[]) { const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts index 06a5b26e..21fc7cf9 100644 --- a/apps/api/src/controllers/v1/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase"; import { Logger } from "../../lib/logger"; import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { diff --git 
a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 845f616c..05144a9b 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function getJob(id: string) { const job = await getScrapeQueue().getJob(id); diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index fb0468c2..cb8b4119 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -1,3 +1,6 @@ +import { configDotenv } from "dotenv"; +configDotenv(); + enum LogLevel { NONE = 'NONE', // No logs will be output. ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. 
diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index ed011b78..ad70dfef 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -2,6 +2,8 @@ import { Job } from "bullmq"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; import { Logger } from "./logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export type ScrapeErrorEvent = { type: "error", diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 90cfb449..b45b8973 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,6 +1,8 @@ import { AuthResponse } from "../../src/types"; import { Logger } from "./logger"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); let warningCount = 0; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index cd199fa1..f67a1cd0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -12,6 +12,8 @@ import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function startWebScraperPipeline({ job, diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index f19b0297..3850e05b 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -1,6 +1,7 @@ import { supabase_service } from "../supabase"; import { Logger } from "../../../src/lib/logger"; -import "dotenv/config"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logCrawl(job_id: string, team_id: string) { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 
'true'; diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index d4494f09..4d8ee014 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -4,6 +4,8 @@ import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logJob(job: FirecrawlJob) { try { diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 30d8fd1e..fbe41653 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -3,6 +3,8 @@ import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logScrape( scrapeLog: ScrapeLog, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6488759f..ad0e4ad5 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -36,6 +36,8 @@ import { } from "../../src/lib/job-priority"; import { PlanType } from "../types"; import { getJobs } from "../../src/controllers/v1/crawl-status"; +import { configDotenv } from "dotenv"; +configDotenv(); if (process.env.ENV === "production") { initSDK({ diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 414d1925..7636717e 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,5 +1,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { Logger } from "../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on 
environment variables. class SupabaseService { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 56dd5c58..06e5649d 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types"; import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; import { WebhookEventType } from "../types"; +import { configDotenv } from "dotenv"; +configDotenv(); export const callWebhook = async ( teamId: string, diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index 3e66a991..a1549e24 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,6 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; -import "dotenv/config"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. 
class SupabaseService { From 78edf13ec6f52c12956b576712c4ca663a5d16ad Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 16:31:42 -0300 Subject: [PATCH 27/62] test: usedbauth envs wth --- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 913f9408..5631adf0 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => { const response: ScrapeResponseRequestTest = await request(TEST_URL).get( "/is-production" ); + + console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION); + console.log('?', process.env.USE_DB_AUTHENTICATION === 'true'); + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + console.log('useDbAuthentication', useDbAuthentication); + console.log('!useDbAuthentication', !useDbAuthentication); + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("isProduction"); }); From 85b824e122a095595b9f188902eb771590392f06 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 4 Sep 2024 16:35:32 -0300 Subject: [PATCH 28/62] test: what about false false? 
--- apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 5631adf0..8aabf748 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -26,7 +26,7 @@ describe("E2E Tests for v1 API Routes", () => { console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION); console.log('?', process.env.USE_DB_AUTHENTICATION === 'true'); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - console.log('useDbAuthentication', useDbAuthentication); + console.log('!!useDbAuthentication', !!useDbAuthentication); console.log('!useDbAuthentication', !useDbAuthentication); expect(response.statusCode).toBe(200); From 28c5635502ebfdc98852ecf576cf7b9aa27f48e8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 16:45:56 -0300 Subject: [PATCH 29/62] Update ci.yml --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2e42e4a..ff22858b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,6 +28,7 @@ env: HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} jobs: From a0113dac3753725500d659f632bfe77a67e8e191 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 16:54:20 -0300 Subject: [PATCH 30/62] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 9ea0435e..53031de9 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ 
b/apps/api/src/services/billing/credit_billing.ts @@ -213,10 +213,8 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { // Cache the results for a minute, sub can be null and that's fine await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null - - if (coupons) { - await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute - } + await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute + } let couponCredits = 0; From 82cb80c8170b299c83cf954b94c6d9c30c2166c0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Sep 2024 23:46:18 -0300 Subject: [PATCH 31/62] Update map.ts --- apps/api/src/controllers/v1/map.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 4c94f041..e6abd9ae 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -62,8 +62,8 @@ export async function mapController( : `site:${req.body.url}`; // www. 
seems to exclude subdomains in some cases const mapResults = await fireEngineMap(mapUrl, { - // limit to 50 results (beta) - numResults: Math.min(limit, 50), + // limit to 100 results (beta) + numResults: Math.min(limit, 100), }); if (mapResults.length > 0) { From eb03a81152883b5371b2a4519f98bae1d71065dd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 12:55:04 -0300 Subject: [PATCH 32/62] Update crawl-status.ts --- apps/api/src/controllers/v1/crawl-status.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 05144a9b..ad4d21d2 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -94,7 +94,8 @@ export async function crawlStatusController(req: RequestWithAuth x.returnvalue); - const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; + const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); nextURL.searchParams.set("skip", (start + data.length).toString()); From 7561bfe173c2a7934ef1ab7b64d1a79f3177833b Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 5 Sep 2024 12:59:32 -0300 Subject: [PATCH 33/62] added envs to github action workflows --- .github/workflows/ci.yml | 2 +- .github/workflows/fly-direct.yml | 6 ++++++ .github/workflows/fly.yml | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff22858b..8a9a74cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ env: HDX_NODE_BETA_MODE: 1 FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} - + ENV: ${{ secrets.ENV }} jobs: pre-deploy: diff --git a/.github/workflows/fly-direct.yml b/.github/workflows/fly-direct.yml index 8ec675fa..2473642c 100644 --- a/.github/workflows/fly-direct.yml +++ b/.github/workflows/fly-direct.yml @@ -22,7 +22,13 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} + ENV: ${{ secrets.ENV }} jobs: deploy: diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 9209309f..7b45921a 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -29,6 +29,7 @@ env: CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} + 
ENV: ${{ secrets.ENV }} jobs: pre-deploy-e2e-tests: From c6f1d8099296496bcd650dac6f8cc52643e34aaa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 13:03:43 -0300 Subject: [PATCH 34/62] Update crawl.ts --- apps/api/src/controllers/v1/crawl.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index c2d5bdca..e0883fa8 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -155,10 +155,12 @@ export async function crawlController( await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started"); } + const protocol = process.env.ENV === "local" ? req.protocol : "https"; + return res.status(200).json({ success: true, id, - url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`, + url: `${protocol}://${req.get("host")}/v1/crawl/${id}`, }); } From b301ffc922561fd363c5207d44e33c6b96e69c9a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:57:26 -0300 Subject: [PATCH 35/62] added missing variables --- apps/api/src/scraper/WebScraper/index.ts | 3 +++ apps/api/src/scraper/WebScraper/single_url.ts | 3 +++ 2 files changed, 6 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fc828224..8bd7d493 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -589,6 +589,9 @@ export class WebScraperDataProvider { includeLinks: options.pageOptions?.includeLinks ?? true, fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, screenshot: options.pageOptions?.screenshot ?? false, + useFastMode: options.pageOptions?.useFastMode ?? false, + disableJSDom: options.pageOptions?.disableJSDom ?? false, + atsv: options.pageOptions?.atsv ?? false }; this.extractorOptions = options.extractorOptions ?? 
{ mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 11e1fe37..f39f045f 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -146,6 +146,9 @@ export async function scrapSingleUrl( parsePDF: pageOptions.parsePDF ?? true, removeTags: pageOptions.removeTags ?? [], onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], + useFastMode: pageOptions.useFastMode ?? false, + disableJSDom: pageOptions.disableJSDom ?? false, + atsv: pageOptions.atsv ?? false } if (extractorOptions) { From 8c1097e9e19f60213cc66299b6452c9141ba9365 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:16:31 -0300 Subject: [PATCH 36/62] fix: pageOptions --- apps/api/src/lib/entities.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 3 ++- apps/api/src/scraper/WebScraper/single_url.ts | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index dfd17c63..d7ec2a83 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -28,7 +28,7 @@ export type PageOptions = { onlyIncludeTags?: string | string[]; includeLinks?: boolean; useFastMode?: boolean; // beta - disableJSDom?: boolean; // beta + disableJsDom?: boolean; // beta atsv?: boolean; // beta }; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 8bd7d493..2f7efa47 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -590,7 +590,7 @@ export class WebScraperDataProvider { fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, screenshot: options.pageOptions?.screenshot ?? false, useFastMode: options.pageOptions?.useFastMode ?? 
false, - disableJSDom: options.pageOptions?.disableJSDom ?? false, + disableJsDom: options.pageOptions?.disableJsDom ?? false, atsv: options.pageOptions?.atsv ?? false }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index aa86ad5e..d5f764b5 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -104,12 +104,13 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, fullPageScreenshot: fullPageScreenshotParam, headers: headers, - pageOptions: pageOptions, disableJsDom: pageOptions?.disableJsDom ?? false, priority, engine, instantReturn: true, ...fireEngineOptionsParam, + atsv: pageOptions?.atsv ?? false, + scrollXPaths: pageOptions?.scrollXPaths ?? [], }, { headers: { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index f39f045f..8bafd203 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -147,7 +147,7 @@ export async function scrapSingleUrl( removeTags: pageOptions.removeTags ?? [], onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], useFastMode: pageOptions.useFastMode ?? false, - disableJSDom: pageOptions.disableJSDom ?? false, + disableJsDom: pageOptions.disableJsDom ?? false, atsv: pageOptions.atsv ?? 
false } @@ -203,6 +203,7 @@ export async function scrapSingleUrl( fireEngineOptions: { engine: engine, atsv: pageOptions.atsv, + disableJsDom: pageOptions.disableJsDom, }, priority, teamId, From cb630bfc341725f46ceb12c0169c7dd7d0ebdb6c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 14:24:10 -0300 Subject: [PATCH 37/62] Update fireEngine.ts --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index d5f764b5..e7361c5c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -69,15 +69,15 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? - Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` - ); - if (pageOptions?.useFastMode) { fireEngineOptionsParam.engine = "tlsclient"; engine = "tlsclient"; } + Logger.info( + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` + ); + // atsv is only available for beta customers const betaCustomersString = process.env.BETA_CUSTOMERS; const betaCustomers = betaCustomersString ? 
betaCustomersString.split(",") : []; @@ -96,6 +96,7 @@ export async function scrapWithFireEngine({ const _response = await Sentry.startSpan({ name: "Call to fire-engine" }, async span => { + return await axiosInstance.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { From b3f21d437b8035304b2b9f075e39cabd097a5e3a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 15:30:10 -0300 Subject: [PATCH 38/62] Update README.md --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 63dd6ea5..c624543b 100644 --- a/README.md +++ b/README.md @@ -402,15 +402,11 @@ class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") data = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True + 'extract': { + 'schema': TopArticlesSchema.model_json_schema() } }) -print(data["llm_extraction"]) +print(data["extract"]) ``` ## Using the Node SDK From 82d6bf4ec8e8d62102436f28b2705e2c532bedc7 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:14:21 -0300 Subject: [PATCH 39/62] feat(go-parser): singleton --- apps/api/src/lib/html-to-markdown.ts | 47 +++++++++++++++++++--------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 103948f4..a542a434 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -8,9 +8,38 @@ import dotenv from 'dotenv'; import { Logger } from './logger'; dotenv.config(); -// TODO: create a singleton for the converter // TODO: add a timeout to the Go parser +class GoMarkdownConverter { + private static instance: GoMarkdownConverter; + private convert: any; + + private constructor() { + 
const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so'); + const lib = koffi.load(goExecutablePath); + this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); + } + + public static getInstance(): GoMarkdownConverter { + if (!GoMarkdownConverter.instance) { + GoMarkdownConverter.instance = new GoMarkdownConverter(); + } + return GoMarkdownConverter.instance; + } + + public async convertHTMLToMarkdown(html: string): Promise { + return new Promise((resolve, reject) => { + this.convert.async(html, (err: Error, res: string) => { + if (err) { + reject(err); + } else { + resolve(res); + } + }); + }); + } +} + export async function parseMarkdown(html: string): Promise { if (!html) { return ''; @@ -18,20 +47,8 @@ export async function parseMarkdown(html: string): Promise { try { if (process.env.USE_GO_MARKDOWN_PARSER == "true") { - const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so'); - const lib = koffi.load(goExecutablePath); - - const convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); - - let markdownContent = await new Promise((resolve, reject) => { - convert.async(html, (err: Error, res: string) => { - if (err) { - reject(err); - } else { - resolve(res); - } - }); - }); + const converter = GoMarkdownConverter.getInstance(); + let markdownContent = await converter.convertHTMLToMarkdown(html); markdownContent = processMultiLineLinks(markdownContent); markdownContent = removeSkipToContentLinks(markdownContent); From 4fa917f2b3c14b7c7f59a79b509debd298c19da3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 16:45:23 -0300 Subject: [PATCH 40/62] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c624543b..96878ea2 100644 --- a/README.md +++ b/README.md @@ -402,6 +402,7 @@ class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") data = 
app.scrape_url('https://news.ycombinator.com', { + 'formats': ['extract'], 'extract': { 'schema': TopArticlesSchema.model_json_schema() } From aa2cf686f4c891e5fe5b8be8eb15050bff01d261 Mon Sep 17 00:00:00 2001 From: Tadashi Shigeoka Date: Fri, 6 Sep 2024 21:41:31 +0900 Subject: [PATCH 41/62] [Docs] upgraded the path of the self-hosted documentation URL to `/v1`. --- SELF_HOST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index f631cf18..2fa87776 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -106,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad If you’d like to test the crawl endpoint, you can run this: ```bash - curl -X POST http://localhost:3002/v0/crawl \ + curl -X POST http://localhost:3002/v1/crawl \ -H 'Content-Type: application/json' \ -d '{ "url": "https://mendable.ai" From 2044e71fcf1fb811a94f8aae1b87acdfaaaac2be Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 6 Sep 2024 15:26:33 -0400 Subject: [PATCH 42/62] Docs to API Spec --- .../turning_docs_into_api_specs/api_spec.json | 771 ------------------ .../combined_api_spec.json | 510 ++++++++++++ .../dify_api_spec.json | 164 ---- .../docs.firecrawl.dev/api_spec_0.json | 211 ----- .../docs.firecrawl.dev/api_spec_1.json | 165 ---- .../docs.firecrawl.dev/api_spec_10.json | 93 --- .../docs.firecrawl.dev/api_spec_11.json | 131 --- .../docs.firecrawl.dev/api_spec_13.json | 87 -- .../docs.firecrawl.dev/api_spec_15.json | 83 -- .../docs.firecrawl.dev/api_spec_16.json | 200 ----- .../docs.firecrawl.dev/api_spec_2.json | 54 -- .../docs.firecrawl.dev/api_spec_22.json | 166 ---- .../docs.firecrawl.dev/api_spec_25.json | 229 ------ .../docs.firecrawl.dev/api_spec_26.json | 115 --- .../docs.firecrawl.dev/api_spec_3.json | 185 ----- .../docs.firecrawl.dev/api_spec_30.json | 212 ----- .../docs.firecrawl.dev/api_spec_31.json | 199 ----- .../docs.firecrawl.dev/api_spec_33.json | 202 ----- .../docs.firecrawl.dev/api_spec_34.json 
| 201 ----- .../docs.firecrawl.dev/api_spec_35.json | 245 ------ .../docs.firecrawl.dev/api_spec_4.json | 129 --- .../docs.firecrawl.dev/api_spec_5.json | 186 ----- .../docs.firecrawl.dev/api_spec_7.json | 86 -- .../docs.firecrawl.dev/api_spec_8.json | 59 -- .../docs.firecrawl.dev/combined_api_spec.json | 738 ----------------- .../turning_docs_into_api_specs.ipynb | 287 ------- .../turning_docs_into_api_specs.py | 137 ++++ 27 files changed, 647 insertions(+), 5198 deletions(-) delete mode 100644 examples/turning_docs_into_api_specs/api_spec.json create mode 100644 examples/turning_docs_into_api_specs/combined_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/dify_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json delete 
mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json delete mode 100644 examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json delete mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb create mode 100644 examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py diff --git a/examples/turning_docs_into_api_specs/api_spec.json b/examples/turning_docs_into_api_specs/api_spec.json deleted file mode 100644 index d866efd3..00000000 --- a/examples/turning_docs_into_api_specs/api_spec.json +++ /dev/null @@ -1,771 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "/crawl/cancel/{jobId}": { - "/crawl/status/{jobId}": { - "get": { - "/scrape": { - "/search": { - "post": { - "components": { - "securitySchemes": { - "Authorization": { - "bearerFormat": "JWT", - "scheme": "bearer", - "type": "http" - } - } - }, - "description": "Send a request to perform a web search and get scraped results from the top pages.", - "operationId": "searchWeb", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "description": "Options for controlling the scraping behavior of search result pages.", - "properties": { - "fetchPageContent": { - "default": true, - "description": "Fetch the content of each 
page. If false, defaults to a basic fast serp API.", - "type": "boolean" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The search query.", - "required": true, - "type": "string" - }, - "searchOptions": { - "description": "Options for controlling the search.", - "properties": { - "limit": { - "description": "Maximum number of search results to return.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." 
- }, - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "An array of search results.", - "items": { - "properties": { - "content": { - "description": "Raw content of the search result page.", - "type": "string" - }, - "markdown": { - "description": "Markdown content of the search result page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the search result page.", - "properties": { - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "nullable": true, - "type": "string" - }, - "sourceURL": { - "description": "Source URL of the search result page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "url": { - "description": "URL of the search result.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "description": "Indicates if the search was successful.", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Web search completed successfully." - } - } - }, - "summary": "Search the Web" - } - }, - "post": { - "description": "Send a request to scrape a single URL and get its content.", - "operationId": "scrapeURL", - "parameters": [], - "requestBody": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - }, - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. 
The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", - "properties": { - "extractionPrompt": { - "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes.", - "type": "string" - }, - "extractionSchema": { - "description": "The schema for the data to be extracted, required only for LLM extraction modes.", - "type": "object" - }, - "mode": { - "default": "markdown", - "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM.", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for controlling the scraping behavior.", - "properties": { - "fullPageScreenshot": { - "default": false, - "description": "Include a full page screenshot of the page that you are scraping.", - "type": "boolean" - }, - "headers": { - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", - "type": "object" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. 
Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - }, - "removeTags": { - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "default": false, - "description": "Replace all relative paths with absolute paths for images and links", - "type": "boolean" - }, - "screenshot": { - "default": false, - "description": "Include a screenshot of the top of the page that you are scraping.", - "type": "boolean" - }, - "waitFor": { - "default": 0, - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "default": 30000, - "description": "Timeout in milliseconds for the request", - "type": "integer" - }, - "url": { - "description": "The URL to scrape.", - "required": true, - "type": "string" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the request.", - "nullable": true, - "type": "string" - }, - "llm_extraction": { - "description": "Extracted data from the page using the specified schema, only present if an LLM extraction mode was used.", - "nullable": true, - "type": "object" - }, - "markdown": { - "description": "Markdown version of the page content.", - "type": "string" - }, - "metadata": { - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": 
"string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the request.", - "nullable": true, - "type": "string" - }, - "warning": { - "description": "Warning message from the LLM extraction process, if any.", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "success": { - "description": "Indicates whether the scraping was successful.", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "URL scraped successfully." - } - } - }, - "summary": "Scrape a URL" - } - }, - "description": "Send a request to get the status and results of a crawl job.", - "operationId": "getCrawlJobStatus", - "parameters": [ - { - "description": "ID of the crawl job to check.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": {} - }, - "responses": { - "200": { - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - }, - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "The number of pages crawled so far.", - "type": "integer" - }, - "data": { - "description": "The crawl results. 
Only available when the crawl job is completed.", - "items": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", - "type": "string" - }, - "index": { - "description": "The index of the crawled page in the results.", - "type": "integer" - }, - "markdown": { - "description": "Markdown content of the page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the page.", - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": "string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "partial_data": { - "description": "Partial results streamed as the crawl progresses. 
This feature is in alpha and may change.", - "items": { - "properties": { - "content": { - "description": "Raw content of the page.", - "type": "string" - }, - "html": { - "description": "HTML version of the page content, only present if `includeHtml` was set to `true` in the crawl request.", - "type": "string" - }, - "index": { - "description": "The index of the crawled page in the results.", - "type": "integer" - }, - "markdown": { - "description": "Markdown content of the page.", - "type": "string" - }, - "metadata": { - "description": "Metadata extracted from the page.", - "properties": { - " ": { - "description": "Any other extracted metadata.", - "type": "string" - }, - "description": { - "description": "Page description.", - "type": "string" - }, - "language": { - "description": "Page language.", - "type": "string" - }, - "pageError": { - "description": "Error message if there was an error scraping the page.", - "type": "string" - }, - "pageStatusCode": { - "description": "HTTP status code of the page.", - "type": "integer" - }, - "sourceURL": { - "description": "Source URL of the page.", - "type": "string" - }, - "title": { - "description": "Page title.", - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content of the page, only present if `includeRawHtml` was set to `true` in the crawl request.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Status of the crawl job. Can be 'completed', 'active', 'failed', or 'paused'.", - "enum": [ - "completed", - "active", - "failed", - "paused" - ], - "type": "string" - }, - "total": { - "description": "The total estimated number of pages to crawl.", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status retrieved." 
- } - }, - "summary": "Get Crawl Job Status" - } - }, - "delete": { - "description": "Send a request to cancel a running crawl job.", - "operationId": "cancelCrawlJob", - "parameters": [ - { - "description": "ID of the crawl job to cancel.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": {} - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "The status of the crawl job cancellation request, usually 'cancelled'.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job cancellation request submitted." - }, - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." - } - }, - "summary": "Cancel a Crawl Job" - } - }, - "description": "Send a request to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.", - "operationId": "crawlWebsite", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Options for controlling the crawling behavior.", - "properties": { - "allowBackwardCrawling": { - "default": false, - "description": "Enables the crawler to navigate from a specific URL to previously linked pages. 
For instance, from 'example.com/product/123' back to 'example.com/product'", - "type": "boolean" - }, - "allowExternalContentLinks": { - "default": false, - "description": "Allows the crawler to follow links to external websites.", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "default": false, - "description": "Generate alt text for images using LLMs (must have a paid plan)", - "type": "boolean" - }, - "ignoreSitemap": { - "default": false, - "description": "Ignore the website sitemap when crawling", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "default": 10000, - "description": "Maximum number of pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.", - "type": "integer" - }, - "mode": { - "default": "default", - "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "default": false, - "description": "If true, returns only the URLs as a list on the crawl status. 
Attention: the return response will be a list of URLs inside the data, not a list of documents.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for controlling the scraping behavior of individual pages.", - "properties": { - "fullPageScreenshot": { - "default": false, - "description": "Include a full page screenshot of the page that you are scraping.", - "type": "boolean" - }, - "headers": { - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc.", - "type": "object" - }, - "includeHtml": { - "default": false, - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "type": "boolean" - }, - "includeRawHtml": { - "default": false, - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "default": false, - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "type": "boolean" - }, - "removeTags": { - "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "default": false, - "description": "Replace all relative paths with absolute paths for images and links", - "type": "boolean" - }, - "screenshot": { - "default": false, - "description": "Include a screenshot of the top of the page that you are scraping.", - "type": "boolean" - }, - "waitFor": { - "default": 0, - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "The base URL to start crawling from", - "required": true, - "type": "string" - } - }, - "type": "object" - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "The ID of the submitted crawl job.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted successfully." - }, - "402": { - "description": "Payment required." - }, - "429": { - "description": "Rate limit exceeded." - }, - "500": { - "description": "Internal server error." 
- } - } - }, - "summary": "Crawl a Website" - } - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/combined_api_spec.json b/examples/turning_docs_into_api_specs/combined_api_spec.json new file mode 100644 index 00000000..526dec8b --- /dev/null +++ b/examples/turning_docs_into_api_specs/combined_api_spec.json @@ -0,0 +1,510 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "https://docs.firecrawl.dev/api-reference API Specification", + "version": "1.0.0" + }, + "paths": { + "/crawl": { + "post": { + "summary": "Crawl a website", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Base URL to crawl" + }, + "excludePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to exclude" + }, + "includePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to include" + }, + "maxDepth": { + "type": "integer", + "description": "Maximum crawl depth" + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore sitemap?" + }, + "limit": { + "type": "integer", + "description": "Maximum pages to crawl" + }, + "allowBackwardLinks": { + "type": "boolean", + "description": "Allow backward links?" + }, + "allowExternalLinks": { + "type": "boolean", + "description": "Allow external links?" 
+ }, + "webhook": { + "type": "string", + "description": "Webhook URL" + }, + "scrapeOptions": { + "type": "object", + "properties": { + "formats": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Formats to include" + }, + "headers": { + "type": "object", + "description": "Headers to send" + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include" + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only main content?" + }, + "waitFor": { + "type": "integer", + "description": "Wait time in ms" + } + } + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Crawl started", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string" + } + } + } + } + } + } + }, + "security": [ + { + "Authorization": [] + } + ] + } + }, + "/scrape": { + "post": { + "summary": "Scrape a webpage", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "URL to scrape" + }, + "formats": { + "type": "array", + "description": "Output formats", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] + } + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only main content" + }, + "includeTags": { + "type": "array", + "description": "Tags to include", + "items": { + "type": "string" + } + }, + "excludeTags": { + "type": "array", + "description": "Tags to exclude", + "items": { + "type": "string" + } + }, + "headers": { + "type": "object", + "description": "Request headers" + }, + "waitFor": { + "type": 
"integer", + "description": "Delay in ms" + }, + "timeout": { + "type": "integer", + "description": "Timeout in ms" + }, + "extract": { + "type": "object", + "description": "Extract object", + "properties": { + "schema": { + "type": "object", + "description": "Extraction schema" + }, + "systemPrompt": { + "type": "string", + "description": "System prompt" + }, + "prompt": { + "type": "string", + "description": "Extraction prompt" + } + } + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Successful scrape", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string" + }, + "rawHtml": { + "type": "string" + }, + "screenshot": { + "type": "string" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string" + } + } + }, + "llm_extraction": { + "type": "object" + }, + "warning": { + "type": "string" + } + } + } + } + } + } + } + } + }, + "security": [ + { + "Bearer": [] + } + ] + } + }, + "/v1/crawl/{id}": { + "get": { + "summary": "Get crawl status", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl status", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Current status of crawl" + }, + "total": { + "type": "integer", + "description": "Total pages crawled" + }, + "completed": { + "type": "integer", + 
"description": "Number of pages crawled" + }, + "creditsUsed": { + "type": "integer", + "description": "Credits used" + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "Crawl expiry" + }, + "next": { + "type": "string", + "nullable": true, + "description": "URL for next data" + }, + "data": { + "type": "array", + "description": "Data of the crawl", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string" + }, + "rawHtml": { + "type": "string" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + }, + "screenshot": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string" + } + } + } + } + } + } + } + } + } + } + } + }, + "security": [ + { + "Bearer": [] + } + ] + } + }, + "/crawl/{id}": { + "delete": { + "summary": "Cancel crawl job", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Crawl job cancelled", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "message": { + "type": "string" + } + } + } + } + } + } + } + } + }, + "/map": { + "post": { + "summary": "Map website and return links", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Base URL to crawl" + }, + "search": { + "type": "string", + "description": "Search query for mapping" + }, + "ignoreSitemap": { + "type": "boolean", + 
"description": "Ignore sitemap?" + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains?" + }, + "limit": { + "type": "integer", + "description": "Max links to return" + } + }, + "required": [ + "url" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful mapping", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + } + } + } + } + } + }, + "components": { + "schemas": {} + } +} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/dify_api_spec.json b/examples/turning_docs_into_api_specs/dify_api_spec.json deleted file mode 100644 index e6eec457..00000000 --- a/examples/turning_docs_into_api_specs/dify_api_spec.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "openapi": "3.0.0", - "info": { - "title": "Knowledge Base API", - "description": "API for managing knowledge bases and documents." 
- }, - "paths": { - "/datasets": { - "post": { - "summary": "Create an Empty Dataset", - "description": "Only used to create an empty dataset", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "name": { - "type": "string" - } - } - } - } - } - }, - "responses": {} - }, - "get": { - "summary": "Dataset List", - "parameters": [ - { - "name": "page", - "in": "query", - "schema": { - "type": "integer" - } - }, - { - "name": "limit", - "in": "query", - "schema": { - "type": "integer" - } - } - ], - "responses": {} - } - }, - "/datasets/{dataset_id}/document/create_by_text": { - "post": { - "summary": "Create Document by Text", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "text": { - "type": "string" - }, - "indexing_technique": { - "type": "string" - }, - "process_rule": { - "type": "object" - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/document/create_by_file": { - "post": { - "summary": "Create Document by File", - "requestBody": { - "content": { - "multipart/form-data": { - "schema": { - "type": "object", - "properties": { - "data": { - "type": "string" - }, - "file": { - "type": "string", - "format": "binary" - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{batch}/indexing-status": { - "get": { - "summary": "Get Document Embedding Status (Progress)", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{document_id}": { - "delete": { - "summary": "Delete Document", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents": { - "get": { - "summary": "Dataset Document List", - "responses": {} - } - }, - "/datasets/{dataset_id}/documents/{document_id}/segments": { - "post": { - "summary": "Add Segments", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - 
"segments": { - "type": "array", - "items": { - "type": "object", - "properties": { - "content": { - "type": "string" - }, - "answer": { - "type": "string" - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } - } - } - } - }, - "responses": {} - } - }, - "/datasets/{dataset_id}/segments/{segment_id}": { - "delete": { - "summary": "Delete Document Segment", - "responses": {} - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json deleted file mode 100644 index 84bce02c..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_0.json +++ /dev/null @@ -1,211 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawling options.", - "properties": { - "excludes": { - "description": "URL patterns to exclude.", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URL patterns to include.", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl.", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth.", - "type": "integer" - }, - "mode": { - "description": "Crawling mode.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - 
"description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job initiated." - } - }, - "summary": "Crawl multiple pages." - } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "Crawl job ID.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Data extraction options.", - "properties": { - "extractionPrompt": { - "description": "Prompt for data extraction.", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for data extraction.", - "type": "object" - }, - "mode": { - "description": "Extraction mode.", - "enum": [ - "llm-extraction", - "llm-extraction-from-raw-html" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": 
"boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in milliseconds.", - "type": "integer" - }, - "url": { - "description": "URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Successful scraping." - } - }, - "summary": "Scrape a single page." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json deleted file mode 100644 index 8656c978..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_1.json +++ /dev/null @@ -1,165 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "allowBackwardCrawling": { - "description": "Allow backward crawling", - "type": "boolean" - }, - "allowExternalContentLinks": { - "description": "Allow external links", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "ignoreSitemap": { - "description": "Ignore website sitemap", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": 
"Return only crawled URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for requests", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Return only main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Use absolute paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait for page load (ms)", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID of the crawl", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl request successful" - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Crawl a website" - } - } - }, - "securitySchemes": { - "Bearer": { - "bearerFormat": "JWT", - "scheme": "bearer", - "type": "http" - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json deleted file mode 100644 index 55f73a32..00000000 --- 
a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_10.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check_crawl_status": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl job status" - } - }, - "/crawl": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Job ID" - } - }, - "summary": "Crawl URL and subpages" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json deleted file mode 100644 index e19ed056..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_11.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "example": { - "extractorOptions": { - "extractionPrompt": "Based on the information on the page, extract the information from the schema. 
", - "extractionSchema": { - "properties": { - "company_mission": { - "type": "string" - }, - "is_in_yc": { - "type": "boolean" - }, - "is_open_source": { - "type": "boolean" - }, - "supports_sso": { - "type": "boolean" - } - }, - "required": [ - "company_mission", - "supports_sso", - "is_open_source", - "is_in_yc" - ], - "type": "object" - }, - "mode": "llm-extraction" - }, - "url": "https://docs.firecrawl.dev/" - }, - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionPrompt": { - "description": "Prompt for extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for data extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "example": { - "data": { - "content": "Raw Content", - "llm_extraction": { - "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", - "is_in_yc": true, - "is_open_source": false, - "supports_sso": true - }, - "metadata": { - "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", - "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. 
Brought to you by SideGuide", - "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", - "ogLocaleAlternate": [], - "ogSiteName": "Mendable", - "ogTitle": "Mendable", - "ogUrl": "https://docs.firecrawl.dev/", - "robots": "follow, index", - "sourceURL": "https://docs.firecrawl.dev/", - "title": "Mendable" - } - }, - "success": true - }, - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "metadata": { - "type": "object" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "summary": "Extract data from pages." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json deleted file mode 100644 index 0352c66f..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_13.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - }, - "url": { - "type": "string" - } - }, - "type": "object" - 
}, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search and scrape." - } - }, - "summary": "Search web, scrape, return markdown." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json deleted file mode 100644 index e7384f8e..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_15.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "url": { - "description": "Website URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "items": { - "properties": { - "markdown": { - "description": "Markdown content.", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - } - } - }, - "description": "Website crawled successfully." - } - }, - "summary": "Crawl a website." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "url": { - "description": "Page URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "text/plain": { - "schema": { - "description": "Scraped content.", - "type": "string" - } - } - }, - "description": "Page scraped successfully." - } - }, - "summary": "Scrape a single page." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json deleted file mode 100644 index ed6fb9d6..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_16.json +++ /dev/null @@ -1,200 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawler_options": { - "properties": { - "exclude": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Max pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs", - "type": "boolean" - }, - "timeout": { - "description": "Timeout in milliseconds", - "type": "integer" - } - }, - "type": "object" - }, - "page_options": { - "properties": { - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Crawl successful." - } - }, - "summary": "Crawl a website." 
- } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractor_options": { - "properties": { - "extractionPrompt": { - "description": "Prompt for extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for extraction", - "type": "string" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "page_options": { - "properties": { - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in milliseconds", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Scrape successful." - } - }, - "summary": "Scrape a website." - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "page_options": { - "properties": { - "fetchPageContent": { - "description": "Fetch full content", - "type": "boolean" - }, - "includeHtml": { - "description": "Include raw HTML", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search query string", - "type": "string" - }, - "search_options": { - "properties": { - "limit": { - "description": "Max results", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "description": "Search successful." - } - }, - "summary": "Search Firecrawl index." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json deleted file mode 100644 index 25cf6c05..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_2.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl/cancel/{jobId}": { - "delete": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Returns cancelled." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Cancel crawl job" - } - } - }, - "securitySchemes": { - "Bearer": { - "bearerFormat": "Bearer ", - "scheme": "bearer", - "type": "http" - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json deleted file mode 100644 index ac146a63..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_22.json +++ /dev/null @@ -1,166 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check-crawl-status/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current progress", - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "description": "Raw content", - "type": "string" - 
}, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "type": "object" - }, - "provider": { - "description": "Data provider", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Job status", - "type": "string" - }, - "total": { - "description": "Total pages", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawler options", - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted." - } - }, - "summary": "Crawl a URL." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Extractor options", - "type": "object" - }, - "pageOptions": { - "description": "Page options", - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Scraped data", - "type": "object" - }, - "success": { - "description": "Success flag", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json deleted file mode 100644 index 9701a462..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_25.json +++ /dev/null @@ -1,229 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "excludes": { - "description": "Paths to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Paths to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "returnOnlyUrls": { - "description": "Only return URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job created" - } - }, - "summary": "Crawl a website" - } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": 
{ - "properties": { - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Job status", - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Get crawl job status" - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scrape results" - } - }, - "summary": "Scrape a webpage" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json deleted file mode 100644 index b642e9c0..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_26.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - 
"version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "example": { - "extractorOptions": { - "extractionPrompt": "Extract company info.", - "extractionSchema": { - "properties": { - "company_description": { - "type": "string" - }, - "company_industry": { - "type": "string" - }, - "who_they_serve": { - "type": "string" - } - }, - "required": [ - "company_description", - "company_industry", - "who_they_serve" - ], - "type": "object" - }, - "mode": "llm-extraction" - }, - "pageOptions": { - "onlyMainContent": true - }, - "url": "https://example.com" - }, - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction.", - "type": "string" - }, - "extractionSchema": { - "properties": { - "properties": { - "company_description": { - "type": "string" - }, - "company_industry": { - "type": "string" - }, - "who_they_serve": { - "type": "string" - } - }, - "required": [ - "company_description", - "company_industry", - "who_they_serve" - ], - "type": { - "type": "string" - } - }, - "type": "object" - }, - "mode": { - "description": "Extraction mode.", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Successful scrape." - } - }, - "summary": "Scrape data from URL." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json deleted file mode 100644 index bcf94159..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_3.json +++ /dev/null @@ -1,185 +0,0 @@ -{ - "components": { - "securitySchemes": { - "bearerAuth": { - "scheme": "bearer", - "type": "http" - } - } - }, - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction", - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for LLM extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for request", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Only return main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Replace relative paths", - 
"type": "boolean" - }, - "screenshot": { - "description": "Include screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in ms", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": "Timeout in ms", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "security": [ - { - "bearerAuth": [] - } - ], - "summary": "Scrape a webpage" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json deleted file mode 100644 index bc542e2a..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_30.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawl job options", - "properties": { - "excludes": { 
- "description": "Pages to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Pages to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Max pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options", - "properties": { - "onlyMainContent": { - "description": "Only scrape main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "required": [ - "url" - ], - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Crawl job result", - "type": "object" - } - } - }, - "description": "Crawl job result" - } - }, - "summary": "Crawl a website" - } - }, - "/crawl/{jobId}/cancel": { - "post": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Cancellation status", - "type": "object" - } - } - }, - "description": "Cancellation status" - } - }, - "summary": "Cancel crawl job" - } - }, - "/crawl/{jobId}/status": { - "get": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Crawl status", - "type": "object" - } - } - }, - "description": "Crawl status" - } - }, - "summary": "Check crawl status" - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "LLM extraction options", - "properties": { - "extractionSchema": { - "description": 
"JSON schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "required": [ - "url" - ], - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Scraped data", - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL" - } - }, - "/search": { - "get": { - "parameters": [ - { - "description": "Search query", - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "description": "Search results", - "type": "object" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search and scrape" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json deleted file mode 100644 index 07f71759..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_31.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "excludes": { - "description": "Paths to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "Paths to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - 
"description": "Starting URL for crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Unique job identifier", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job started" - } - }, - "summary": "Crawl a website" - } - }, - "/crawl/{jobId}/status": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "Current job status", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl status" - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "properties": { - "extractionSchema": { - "description": "Zod schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Extracted data", - "type": "object" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL" - } - }, - "/search": { - "get": { - "parameters": [ - { - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "items": { - "properties": { - "content": { - "description": "Page content (optional)", - "type": "string" - }, - "url": { - "description": "Result URL", - "type": "string" - } - }, - "type": "object" - }, - "type": 
"array" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search for a query" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json deleted file mode 100644 index b45ae841..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_33.json +++ /dev/null @@ -1,202 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Options for crawling", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URLs to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for page content", - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Unique crawl job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job started." - } - }, - "summary": "Crawl a website." 
- } - }, - "/crawl/{jobId}": { - "get": { - "parameters": [ - { - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "description": "Current job status", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for data extraction", - "properties": { - "extractionSchema": { - "description": "Pydantic schema", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Options for page content", - "properties": { - "onlyMainContent": { - "description": "Extract only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "query": { - "description": "Search query", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Search results." - } - }, - "summary": "Search the web." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json deleted file mode 100644 index 3bafda42..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_34.json +++ /dev/null @@ -1,201 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "0.1" - }, - "openapi": "3.0.0", - "paths": { - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawl job options", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URLs to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options", - "properties": { - "onlyMainContent": { - "description": "Only scrape main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Crawl job started" - } - }, - "summary": "Crawl a website." - } - }, - "/crawl/{job_id}/cancel": { - "post": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "job_id", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Cancellation status" - } - }, - "summary": "Cancel crawl job." 
- } - }, - "/crawl/{job_id}/status": { - "get": { - "parameters": [ - { - "description": "Crawl job ID", - "in": "path", - "name": "job_id", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Crawl status" - } - }, - "summary": "Check crawl status." - } - }, - "/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "LLM extraction options", - "properties": { - "extractionSchema": { - "description": "JSON schema for extraction", - "type": "object" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Scraped data" - } - }, - "summary": "Scrape a single URL." - } - }, - "/search": { - "get": { - "parameters": [ - { - "description": "Search query", - "in": "query", - "name": "query", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "type": "object" - } - } - }, - "description": "Search results" - } - }, - "summary": "Search and scrape results." 
- } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json deleted file mode 100644 index 890d31b1..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_35.json +++ /dev/null @@ -1,245 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check-crawl-status": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current page count", - "type": "integer" - }, - "data": { - "description": "Crawl data", - "items": { - "properties": { - "content": { - "description": "Raw content", - "type": "string" - }, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "properties": { - "description": { - "description": "Page description", - "type": "string" - }, - "language": { - "description": "Page language", - "type": "string" - }, - "sourceURL": { - "description": "Page URL", - "type": "string" - }, - "title": { - "description": "Page title", - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "description": "Content provider", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Crawl status", - "type": "string" - }, - "total": { - "description": "Total page count", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." 
- } - }, - "/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawler options", - "properties": { - "excludes": { - "description": "URLs to exclude", - "items": { - "type": "string" - }, - "type": "array" - } - }, - "type": "object" - }, - "url": { - "description": "URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job submitted." - } - }, - "summary": "Crawl a URL." - } - }, - "/scrape-url": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Extractor options", - "properties": { - "extractionSchema": { - "description": "Extraction schema", - "type": "string" - }, - "mode": { - "description": "Extraction mode", - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page options", - "properties": { - "onlyMainContent": { - "description": "Only main content", - "type": "boolean" - } - }, - "type": "object" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "description": "Scraped data", - "properties": { - "content": { - "description": "Raw content", - "type": "string" - }, - "html": { - "description": "HTML content", - "type": "string" - }, - "llm_extraction": { - "description": "LLM extraction results", - "type": "object" - }, - "markdown": { - "description": "Markdown content", - "type": "string" - }, - "metadata": { - "description": "Page metadata", - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML 
content", - "type": "string" - }, - "warning": { - "description": "Warning message", - "type": "string" - } - }, - "type": "object" - }, - "success": { - "description": "Request success", - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Scraped data." - } - }, - "summary": "Scrape a single URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json deleted file mode 100644 index daf53932..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_4.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "components": { - "securitySchemes": { - "Bearer": { - "scheme": "bearer", - "type": "http" - } - } - }, - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch content of each page.", - "type": "boolean" - }, - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only return main content.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The query to search for", - "type": "string" - }, - "searchOptions": { - "properties": { - "limit": { - "description": "Maximum number of results.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - 
"description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Search the web." - } - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json deleted file mode 100644 index 4fae28c0..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_5.json +++ /dev/null @@ -1,186 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "description": "Current page number", - "type": "integer" - }, - "data": { - "description": "Data from the job", - "items": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "description": "HTML content", - "nullable": true, - "type": "string" - }, - "index": { - "description": "Page number crawled", - "type": "integer" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message of page", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "Status code 
of page", - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - }, - "{any other metadata}": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "partial_data": { - "description": "Partial documents (streaming)", - "items": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "description": "HTML content", - "nullable": true, - "type": "string" - }, - "index": { - "description": "Page number crawled", - "type": "integer" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "nullable": true, - "type": "string" - }, - "pageError": { - "description": "Error message of page", - "nullable": true, - "type": "string" - }, - "pageStatusCode": { - "description": "Status code of page", - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - }, - "{any other metadata}": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "description": "Raw HTML content", - "nullable": true, - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "description": "Status of the job", - "type": "string" - }, - "total": { - "description": "Total number of pages", - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Successful operation" - } - }, - "security": [ - { - "Authorization": [] - } - ], - "summary": "Get crawl job status" - } - } - }, - "securitySchemes": { - "Authorization": { - "bearerFormat": "Bearer ", - "scheme": "bearer", - "type": "http" - } - }, - "servers": [ - { - "url": "https://api.firecrawl.dev/v0" - } - ] -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json 
b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json deleted file mode 100644 index b74b9886..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_7.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/v0/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch page content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search term", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search" - } - }, - "summary": "Search and extract content" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json deleted file mode 100644 index 2d5f40e2..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/api_spec_8.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "info": { - "title": "Firecrawl API", - "version": "v0" - }, - "openapi": "3.0.0", - "paths": { - "/test": { - "get": { - "description": "Returns a test message.", - "responses": { - "200": { - "content": { - "text/plain": { - "schema": { - "example": "Hello, 
world!", - "type": "string" - } - } - }, - "description": "Successful operation" - } - }, - "summary": "Test endpoint" - } - }, - "/v0/crawl": { - "post": { - "description": "Processes crawl job for URL.", - "requestBody": { - "content": { - "application/json": { - "example": { - "url": "https://docs.firecrawl.dev" - }, - "schema": { - "properties": { - "url": { - "description": "Website URL", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "URL to crawl", - "required": true - }, - "responses": { - "200": { - "description": "Crawl initiated." - } - }, - "summary": "Crawl a given URL." - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json b/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json deleted file mode 100644 index 77d67234..00000000 --- a/examples/turning_docs_into_api_specs/docs.firecrawl.dev/combined_api_spec.json +++ /dev/null @@ -1,738 +0,0 @@ -{ - "components": { - "schemas": {} - }, - "info": { - "title": "https://docs.firecrawl.dev API Specification", - "version": "1.0.0" - }, - "openapi": "3.0.0", - "paths": { - "/check_crawl_status": { - "post": { - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "current": { - "type": "integer" - }, - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "provider": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "status": { - "type": "string" - }, - "total": { - "type": "integer" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job status" - } - }, - "summary": "Check crawl job status" - } - }, - 
"/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "properties": { - "allowBackwardCrawling": { - "description": "Allow backward crawling", - "type": "boolean" - }, - "allowExternalContentLinks": { - "description": "Allow external links", - "type": "boolean" - }, - "excludes": { - "description": "URL patterns to exclude", - "items": { - "type": "string" - }, - "type": "array" - }, - "generateImgAltText": { - "description": "Generate alt text for images", - "type": "boolean" - }, - "ignoreSitemap": { - "description": "Ignore website sitemap", - "type": "boolean" - }, - "includes": { - "description": "URL patterns to include", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth", - "type": "integer" - }, - "mode": { - "description": "Crawling mode", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only crawled URLs", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for requests", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Return only main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove specific tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Use absolute paths", - "type": 
"boolean" - }, - "screenshot": { - "description": "Include page screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait for page load (ms)", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Job ID of the crawl", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl request successful" - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Crawl a website" - } - }, - "/crawl/cancel/{jobId}": { - "delete": { - "parameters": [ - { - "description": "ID of crawl job", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "status": { - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Returns cancelled." 
- } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Cancel crawl job" - } - }, - "/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch content of each page.", - "type": "boolean" - }, - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only return main content.", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "The query to search for", - "type": "string" - }, - "searchOptions": { - "properties": { - "limit": { - "description": "Maximum number of results.", - "type": "integer" - } - }, - "type": "object" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "content": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful search." - } - }, - "security": [ - { - "Bearer": [] - } - ], - "summary": "Search the web." 
- } - }, - "/v0/crawl": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "crawlerOptions": { - "description": "Crawling options.", - "properties": { - "excludes": { - "description": "URL patterns to exclude.", - "items": { - "type": "string" - }, - "type": "array" - }, - "includes": { - "description": "URL patterns to include.", - "items": { - "type": "string" - }, - "type": "array" - }, - "limit": { - "description": "Maximum pages to crawl.", - "type": "integer" - }, - "maxDepth": { - "description": "Maximum crawl depth.", - "type": "integer" - }, - "mode": { - "description": "Crawling mode.", - "enum": [ - "default", - "fast" - ], - "type": "string" - }, - "returnOnlyUrls": { - "description": "Return only URLs.", - "type": "boolean" - } - }, - "type": "object" - }, - "pageOptions": { - "description": "Page scraping options.", - "properties": { - "includeHtml": { - "description": "Include HTML content.", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content.", - "type": "boolean" - }, - "onlyMainContent": { - "description": "Only main content.", - "type": "boolean" - }, - "screenshot": { - "description": "Include page screenshot.", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in milliseconds.", - "type": "integer" - } - }, - "type": "object" - }, - "url": { - "description": "Base URL to crawl.", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "jobId": { - "description": "Crawl job ID.", - "type": "string" - } - }, - "type": "object" - } - } - }, - "description": "Crawl job initiated." - } - }, - "summary": "Crawl multiple pages." 
- } - }, - "/v0/crawl/status/{jobId}": { - "get": { - "parameters": [ - { - "description": "Crawl job ID.", - "in": "path", - "name": "jobId", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job status." - } - }, - "summary": "Check crawl job status." - } - }, - "/v0/scrape": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "extractorOptions": { - "description": "Options for extraction", - "properties": { - "extractionPrompt": { - "description": "Prompt for LLM extraction", - "type": "string" - }, - "extractionSchema": { - "description": "Schema for LLM extraction", - "type": "object" - }, - "mode": { - "description": "Extraction mode", - "enum": [ - "markdown", - "llm-extraction", - "llm-extraction-from-raw-html", - "llm-extraction-from-markdown" - ], - "type": "string" - } - }, - "type": "object" - }, - "pageOptions": { - "properties": { - "fullPageScreenshot": { - "description": "Include full page screenshot", - "type": "boolean" - }, - "headers": { - "description": "Headers for request", - "type": "object" - }, - "includeHtml": { - "description": "Include HTML content", - "type": "boolean" - }, - "includeRawHtml": { - "description": "Include raw HTML content", - "type": "boolean" - }, - "onlyIncludeTags": { - "description": "Include only these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "onlyMainContent": { - "description": "Only return main content", - "type": "boolean" - }, - "removeTags": { - "description": "Remove these tags", - "items": { - "type": "string" - }, - "type": "array" - }, - "replaceAllPathsWithAbsolutePaths": { - "description": "Replace relative paths", - "type": "boolean" - }, - "screenshot": { - "description": "Include screenshot", - "type": "boolean" - }, - "waitFor": { - "description": "Wait time in ms", - "type": "integer" - } - }, - "type": "object" - }, - "timeout": { - "description": 
"Timeout in ms", - "type": "integer" - }, - "url": { - "description": "URL to scrape", - "type": "string" - } - }, - "type": "object" - } - } - }, - "required": true - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "properties": { - "content": { - "type": "string" - }, - "html": { - "type": "string" - }, - "llm_extraction": { - "type": "object" - }, - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "pageError": { - "type": "string" - }, - "pageStatusCode": { - "type": "integer" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "rawHtml": { - "type": "string" - }, - "warning": { - "type": "string" - } - }, - "type": "object" - }, - "success": { - "type": "boolean" - } - }, - "type": "object" - } - } - }, - "description": "Successful scrape" - } - }, - "security": [ - { - "bearerAuth": [] - } - ], - "summary": "Scrape a webpage" - } - }, - "/v0/search": { - "post": { - "requestBody": { - "content": { - "application/json": { - "schema": { - "properties": { - "pageOptions": { - "properties": { - "fetchPageContent": { - "description": "Fetch page content", - "type": "boolean" - } - }, - "type": "object" - }, - "query": { - "description": "Search term", - "type": "string" - } - }, - "type": "object" - } - } - } - }, - "responses": { - "200": { - "content": { - "application/json": { - "schema": { - "properties": { - "data": { - "items": { - "properties": { - "markdown": { - "type": "string" - }, - "metadata": { - "properties": { - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "title": { - "type": "string" - } - }, - "type": "object" - }, - "url": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "success": { - "type": "boolean" - 
} - }, - "type": "object" - } - } - }, - "description": "Successful search" - } - }, - "summary": "Search and extract content" - } - } - } -} \ No newline at end of file diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb deleted file mode 100644 index 1b97f67b..00000000 --- a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.ipynb +++ /dev/null @@ -1,287 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import os\n", - "import datetime\n", - "import time\n", - "from firecrawl import FirecrawlApp\n", - "import json\n", - "import google.generativeai as genai\n", - "from dotenv import load_dotenv\n", - "\n", - "# Load environment variables\n", - "load_dotenv()\n", - "\n", - "# Retrieve API keys from environment variables\n", - "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", - "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", - "\n", - "# Configure the Google Generative AI module with the API key\n", - "genai.configure(api_key=google_api_key)\n", - "model = genai.GenerativeModel(\"gemini-1.5-pro-001\")\n", - "\n", - "# Set the docs URL\n", - "docs_url=\"https://docs.firecrawl.dev\"\n", - "\n", - "# Initialize the FirecrawlApp with your API key\n", - "app = FirecrawlApp(api_key=firecrawl_api_key)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36\n" - ] - } - ], - "source": 
[ - "# Crawl all pages on docs\n", - "params = {\n", - " \"pageOptions\": {\n", - " \"onlyMainContent\": True\n", - " },\n", - "}\n", - "crawl_result = app.crawl_url(docs_url, params=params)\n", - "\n", - "print(len(crawl_result))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "prompt_instructions = f\"\"\"Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.\n", - "\n", - "If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}.\n", - "\n", - "Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.\n", - "\n", - "API Documentation Content:\n", - "{{content}}\n", - "\n", - "Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.\n", - "\n", - "Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}.\n", - "\n", - "To reiterate: accuracy is paramount. Do not make anything up. 
If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}.\n", - "\"\"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "API specification saved to docs.firecrawl.dev/api_spec_0.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_1.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_2.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_3.json\n", - "API specification saved to docs.firecrawl.dev/api_spec_4.json\n", - "An error occurred for page 5: 'content'\n", - "No API specification found for page 6\n", - "API specification saved to docs.firecrawl.dev/api_spec_7.json\n", - "No API specification found for page 8\n", - "No API specification found for page 9\n", - "API specification saved to docs.firecrawl.dev/api_spec_10.json\n", - "No API specification found for page 11\n", - "No API specification found for page 12\n", - "API specification saved to docs.firecrawl.dev/api_spec_13.json\n", - "No API specification found for page 14\n", - "No API specification found for page 15\n", - "No API specification found for page 16\n", - "No API specification found for page 17\n", - "No API specification found for page 18\n", - "No API specification found for page 19\n", - "No API specification found for page 20\n", - "No API specification found for page 21\n", - "No API specification found for page 22\n", - "No API specification found for page 23\n", - "No API specification found for page 24\n", - "No API specification found for page 25\n", - "No API specification found for page 26\n", - "No API specification found for page 27\n", - "No API specification found for page 28\n", - "No API specification found for page 29\n", - "No API specification found for page 30\n", - "No API specification found for page 31\n", - "No API specification found for page 32\n", - "No API 
specification found for page 33\n", - "No API specification found for page 34\n", - "No API specification found for page 35\n", - "Total API specifications collected: 8\n" - ] - } - ], - "source": [ - "# Create a folder for storing API specs\n", - "import os\n", - "import urllib.parse\n", - "\n", - "folder_name = urllib.parse.urlparse(docs_url).netloc\n", - "os.makedirs(folder_name, exist_ok=True)\n", - "\n", - "# Initialize a list to store all API specs\n", - "all_api_specs = []\n", - "\n", - "# Process each page in crawl_result\n", - "for index, result in enumerate(crawl_result):\n", - " if 'content' in result:\n", - " # Update prompt_instructions with the current page's content\n", - " current_prompt = prompt_instructions.replace(\"{content}\", result['content'])\n", - " try:\n", - " # Query the model\n", - " response = model.generate_content([current_prompt])\n", - " response_dict = response.to_dict()\n", - " response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", - " \n", - " # Remove the ```json code wrap if present\n", - " response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()\n", - " \n", - " # Parse JSON\n", - " json_data = json.loads(response_text)\n", - " \n", - " # Save non-empty API specs\n", - " if json_data != {}:\n", - " output_file = os.path.join(folder_name, f'api_spec_{index}.json')\n", - " with open(output_file, 'w') as f:\n", - " json.dump(json_data, f, indent=2, sort_keys=True)\n", - " print(f\"API specification saved to {output_file}\")\n", - " \n", - " # Add the API spec to the list\n", - " all_api_specs.append(json_data)\n", - " else:\n", - " print(f\"No API specification found for page {index}\")\n", - " \n", - " except json.JSONDecodeError:\n", - " print(f\"Error parsing JSON response for page {index}\")\n", - " except Exception as e:\n", - " print(f\"An error occurred for page {index}: {str(e)}\")\n", - "\n", - "# Print the total number of API specs collected\n", - 
"print(f\"Total API specifications collected: {len(all_api_specs)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Combined API specification saved to docs.firecrawl.dev/combined_api_spec.json\n", - "Total paths in combined spec: 8\n", - "Total schemas in combined spec: 0\n" - ] - } - ], - "source": [ - "# Combine all API specs and keep the most filled out spec for each path and method\n", - "combined_spec = {\n", - " \"openapi\": \"3.0.0\",\n", - " \"info\": {\n", - " \"title\": f\"{docs_url} API Specification\",\n", - " \"version\": \"1.0.0\"\n", - " },\n", - " \"paths\": {},\n", - " \"components\": {\n", - " \"schemas\": {}\n", - " }\n", - "}\n", - "\n", - "def count_properties(obj):\n", - " if isinstance(obj, dict):\n", - " return sum(count_properties(v) for v in obj.values()) + len(obj)\n", - " elif isinstance(obj, list):\n", - " return sum(count_properties(item) for item in obj)\n", - " else:\n", - " return 1\n", - "\n", - "for spec in all_api_specs:\n", - " if \"paths\" in spec:\n", - " for path, methods in spec[\"paths\"].items():\n", - " if path not in combined_spec[\"paths\"]:\n", - " combined_spec[\"paths\"][path] = {}\n", - " for method, details in methods.items():\n", - " if method not in combined_spec[\"paths\"][path] or count_properties(details) > count_properties(combined_spec[\"paths\"][path][method]):\n", - " combined_spec[\"paths\"][path][method] = details\n", - "\n", - " if \"components\" in spec and \"schemas\" in spec[\"components\"]:\n", - " for schema_name, schema in spec[\"components\"][\"schemas\"].items():\n", - " if schema_name not in combined_spec[\"components\"][\"schemas\"] or count_properties(schema) > count_properties(combined_spec[\"components\"][\"schemas\"][schema_name]):\n", - " combined_spec[\"components\"][\"schemas\"][schema_name] = schema\n", - "\n", - "# Save the combined API spec\n", - "output_file = 
os.path.join(folder_name, 'combined_api_spec.json')\n", - "with open(output_file, 'w') as f:\n", - " json.dump(combined_spec, f, indent=2, sort_keys=True)\n", - "\n", - "print(f\"Combined API specification saved to {output_file}\")\n", - "print(f\"Total paths in combined spec: {len(combined_spec['paths'])}\")\n", - "print(f\"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# note: turn this into a simple web app like roast my site\n", - "- select which methods you want to add\n", - "- generate a UI for each method\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py new file mode 100644 index 00000000..47b54ede --- /dev/null +++ b/examples/turning_docs_into_api_specs/turning_docs_into_api_specs.py @@ -0,0 +1,137 @@ +# %% +import os +import datetime +import time +from firecrawl import FirecrawlApp +import json +import google.generativeai as genai +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +google_api_key = os.getenv("GOOGLE_API_KEY") +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") + +# Configure the Google Generative AI module with the API key +genai.configure(api_key=google_api_key) +model = genai.GenerativeModel("gemini-1.5-pro-001") + +# Set the docs URL +docs_url = 
"https://docs.firecrawl.dev/api-reference" + +# Initialize the FirecrawlApp with your API key +app = FirecrawlApp(api_key=firecrawl_api_key) + +# %% +# Crawl all pages on docs +crawl_result = app.crawl_url(docs_url) +print(f"Total pages crawled: {len(crawl_result['data'])}") + +# %% +# Define the prompt instructions for generating OpenAPI specs +prompt_instructions = """ +Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less. + +If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}. + +Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in. + +API Documentation Content: +{{content}} + +Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum. + +Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}. + +To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}. 
+""" + +# %% +# Initialize a list to store all API specs +all_api_specs = [] + +# Process each page in crawl_result +for index, page in enumerate(crawl_result['data']): + if 'markdown' in page: + # Update prompt_instructions with the current page's content + current_prompt = prompt_instructions.replace("{content}", page['markdown']) + try: + # Query the model + response = model.generate_content([current_prompt]) + response_dict = response.to_dict() + response_text = response_dict['candidates'][0]['content']['parts'][0]['text'] + + # Remove the ```json code wrap if present + response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip() + + # Parse JSON + json_data = json.loads(response_text) + + # Add non-empty API specs to the list + if json_data != {}: + all_api_specs.append(json_data) + print(f"API specification generated for page {index}") + else: + print(f"No API specification found for page {index}") + + except json.JSONDecodeError: + print(f"Error parsing JSON response for page {index}") + except Exception as e: + print(f"An error occurred for page {index}: {str(e)}") + +# Print the total number of API specs collected +print(f"Total API specifications collected: {len(all_api_specs)}") + +# %% +# Combine all API specs and keep the most filled out spec for each path and method +combined_spec = { + "openapi": "3.0.0", + "info": { + "title": f"{docs_url} API Specification", + "version": "1.0.0" + }, + "paths": {}, + "components": { + "schemas": {} + } +} + +# Helper function to count properties in an object +def count_properties(obj): + if isinstance(obj, dict): + return sum(count_properties(v) for v in obj.values()) + len(obj) + elif isinstance(obj, list): + return sum(count_properties(item) for item in obj) + else: + return 1 + +# Combine specs, keeping the most detailed version of each path and schema +for spec in all_api_specs: + # Combine paths + if "paths" in spec: + for path, methods in spec["paths"].items(): + if path not in 
combined_spec["paths"]: + combined_spec["paths"][path] = {} + for method, details in methods.items(): + if method not in combined_spec["paths"][path] or count_properties(details) > count_properties(combined_spec["paths"][path][method]): + combined_spec["paths"][path][method] = details + + # Combine schemas + if "components" in spec and "schemas" in spec["components"]: + for schema_name, schema in spec["components"]["schemas"].items(): + if schema_name not in combined_spec["components"]["schemas"] or count_properties(schema) > count_properties(combined_spec["components"]["schemas"][schema_name]): + combined_spec["components"]["schemas"][schema_name] = schema + +# Print summary of combined spec +print(f"Combined API specification generated") +print(f"Total paths in combined spec: {len(combined_spec['paths'])}") +print(f"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}") + +# Save the combined spec to a JSON file in the same directory as the Python file +output_file = os.path.join(os.path.dirname(__file__), "combined_api_spec.json") +with open(output_file, "w") as f: + json.dump(combined_spec, f, indent=2) + +print(f"Combined API specification saved to {output_file}") From 2d245a35f2131cb2f00b759b92adb75d306a4447 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 6 Sep 2024 15:27:58 -0400 Subject: [PATCH 43/62] Delete combined_api_spec.json --- .../combined_api_spec.json | 510 ------------------ 1 file changed, 510 deletions(-) delete mode 100644 examples/turning_docs_into_api_specs/combined_api_spec.json diff --git a/examples/turning_docs_into_api_specs/combined_api_spec.json b/examples/turning_docs_into_api_specs/combined_api_spec.json deleted file mode 100644 index 526dec8b..00000000 --- a/examples/turning_docs_into_api_specs/combined_api_spec.json +++ /dev/null @@ -1,510 +0,0 @@ -{ - "openapi": "3.0.0", - "info": { - "title": "https://docs.firecrawl.dev/api-reference API Specification", - "version": "1.0.0" - }, - "paths": { - 
"/crawl": { - "post": { - "summary": "Crawl a website", - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "Base URL to crawl" - }, - "excludePaths": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to exclude" - }, - "includePaths": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to include" - }, - "maxDepth": { - "type": "integer", - "description": "Maximum crawl depth" - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore sitemap?" - }, - "limit": { - "type": "integer", - "description": "Maximum pages to crawl" - }, - "allowBackwardLinks": { - "type": "boolean", - "description": "Allow backward links?" - }, - "allowExternalLinks": { - "type": "boolean", - "description": "Allow external links?" - }, - "webhook": { - "type": "string", - "description": "Webhook URL" - }, - "scrapeOptions": { - "type": "object", - "properties": { - "formats": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Formats to include" - }, - "headers": { - "type": "object", - "description": "Headers to send" - }, - "includeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags to include" - }, - "excludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags to exclude" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only main content?" 
- }, - "waitFor": { - "type": "integer", - "description": "Wait time in ms" - } - } - } - } - } - } - } - }, - "responses": { - "200": { - "description": "Crawl started", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "id": { - "type": "string" - }, - "url": { - "type": "string" - } - } - } - } - } - } - }, - "security": [ - { - "Authorization": [] - } - ] - } - }, - "/scrape": { - "post": { - "summary": "Scrape a webpage", - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "URL to scrape" - }, - "formats": { - "type": "array", - "description": "Output formats", - "items": { - "type": "string", - "enum": [ - "markdown", - "html", - "rawHtml", - "links", - "screenshot", - "extract", - "screenshot@fullPage" - ] - } - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only main content" - }, - "includeTags": { - "type": "array", - "description": "Tags to include", - "items": { - "type": "string" - } - }, - "excludeTags": { - "type": "array", - "description": "Tags to exclude", - "items": { - "type": "string" - } - }, - "headers": { - "type": "object", - "description": "Request headers" - }, - "waitFor": { - "type": "integer", - "description": "Delay in ms" - }, - "timeout": { - "type": "integer", - "description": "Timeout in ms" - }, - "extract": { - "type": "object", - "description": "Extract object", - "properties": { - "schema": { - "type": "object", - "description": "Extraction schema" - }, - "systemPrompt": { - "type": "string", - "description": "System prompt" - }, - "prompt": { - "type": "string", - "description": "Extraction prompt" - } - } - } - } - } - } - } - }, - "responses": { - "200": { - "description": "Successful scrape", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - 
"type": "boolean" - }, - "data": { - "type": "object", - "properties": { - "markdown": { - "type": "string" - }, - "html": { - "type": "string" - }, - "rawHtml": { - "type": "string" - }, - "screenshot": { - "type": "string" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "statusCode": { - "type": "integer" - }, - "error": { - "type": "string" - } - } - }, - "llm_extraction": { - "type": "object" - }, - "warning": { - "type": "string" - } - } - } - } - } - } - } - } - }, - "security": [ - { - "Bearer": [] - } - ] - } - }, - "/v1/crawl/{id}": { - "get": { - "summary": "Get crawl status", - "parameters": [ - { - "name": "id", - "in": "path", - "description": "ID of crawl job", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl status", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "status": { - "type": "string", - "description": "Current status of crawl" - }, - "total": { - "type": "integer", - "description": "Total pages crawled" - }, - "completed": { - "type": "integer", - "description": "Number of pages crawled" - }, - "creditsUsed": { - "type": "integer", - "description": "Credits used" - }, - "expiresAt": { - "type": "string", - "format": "date-time", - "description": "Crawl expiry" - }, - "next": { - "type": "string", - "nullable": true, - "description": "URL for next data" - }, - "data": { - "type": "array", - "description": "Data of the crawl", - "items": { - "type": "object", - "properties": { - "markdown": { - "type": "string" - }, - "html": { - "type": "string" - }, - "rawHtml": { - "type": "string" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - }, - "screenshot": { - 
"type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "statusCode": { - "type": "integer" - }, - "error": { - "type": "string" - } - } - } - } - } - } - } - } - } - } - } - }, - "security": [ - { - "Bearer": [] - } - ] - } - }, - "/crawl/{id}": { - "delete": { - "summary": "Cancel crawl job", - "security": [ - { - "bearerAuth": [] - } - ], - "parameters": [ - { - "name": "id", - "in": "path", - "description": "ID of crawl job", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Crawl job cancelled", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "message": { - "type": "string" - } - } - } - } - } - } - } - } - }, - "/map": { - "post": { - "summary": "Map website and return links", - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "Base URL to crawl" - }, - "search": { - "type": "string", - "description": "Search query for mapping" - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore sitemap?" - }, - "includeSubdomains": { - "type": "boolean", - "description": "Include subdomains?" 
- }, - "limit": { - "type": "integer", - "description": "Max links to return" - } - }, - "required": [ - "url" - ] - } - } - } - }, - "responses": { - "200": { - "description": "Successful mapping", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "success": { - "type": "boolean" - }, - "links": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } - } - } - } - } - }, - "components": { - "schemas": {} - } -} \ No newline at end of file From 79870e73053ef2a960112f2fa2227b39d0512bdb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 6 Sep 2024 20:15:26 -0300 Subject: [PATCH 44/62] Update excludeTags.ts --- apps/api/src/scraper/WebScraper/utils/excludeTags.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index bb9c5194..400ef84f 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -39,16 +39,8 @@ export const excludeNonMainTags = [ "#search", ".share", "#share", - ".pagination", - "#pagination", ".widget", "#widget", - ".related", - "#related", - ".tag", - "#tag", - ".category", - "#category", ".cookie", "#cookie" ]; From 5758af3291aaebbdc13c3e7c469b3406f730476a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 7 Sep 2024 13:12:46 -0300 Subject: [PATCH 45/62] Update website_params.ts --- .../src/scraper/WebScraper/utils/custom/website_params.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index af8d1f34..8169d9d3 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -242,5 +242,13 @@ export const urlSpecificParams = { engine: "chrome-cdp", }, }, + }, + "lorealparis.hu":{ + 
defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + engine: "tlsclient", + }, + }, } }; From 48c665519ebff263316601a33620115d68d00c41 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 7 Sep 2024 13:42:45 -0300 Subject: [PATCH 46/62] Update credit_billing.ts --- .../src/services/billing/credit_billing.ts | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 53031de9..d22f0372 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -186,7 +186,8 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { getValue(cacheKeyCoupons) ]); - let subscription, subscriptionError, coupons; + let subscription, subscriptionError; + let coupons : {credits: number}[]; if (cachedSubscription && cachedCoupons) { subscription = JSON.parse(cachedSubscription); @@ -225,16 +226,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { ); } + + // If there are available coupons and they are enough for the operation + if (couponCredits >= credits) { + return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; + } // Free credits, no coupons if (!subscription || subscriptionError) { - // If there is no active subscription but there are available coupons - if (couponCredits >= credits) { - return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; - } - let creditUsages; let creditUsageError; let totalCreditsUsed = 0; @@ -251,6 +252,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const retryInterval = 2000; // 2 seconds while (retries < maxRetries) { + // Reminder, this has an 1000 limit. 
const result = await supabase_service .from("credit_usage") .select("credits_used") @@ -292,7 +294,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { end.setDate(end.getDate() + 30); // check if usage is within 80% of the limit const creditLimit = FREE_CREDITS; - const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit; + const creditUsagePercentage = totalCreditsUsed / creditLimit; // Add a check to ensure totalCreditsUsed is greater than 0 if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { @@ -306,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } // 5. Compare the total credits used with the credits allowed by the plan. - if (totalCreditsUsed + credits > FREE_CREDITS) { + if (totalCreditsUsed > FREE_CREDITS) { // Send email notification for insufficient credits await sendNotification( team_id, @@ -366,7 +368,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { // Get the price details from cache or database const priceCacheKey = `price_${subscription.price_id}`; - let price; + let price : {credits: number}; try { const cachedPrice = await getValue(priceCacheKey); @@ -394,29 +396,31 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { Logger.error(`Error retrieving or caching price: ${error}`); Sentry.captureException(error); // If errors, just assume it's a big number so user don't get an error - price = { credits: 1000000 }; + price = { credits: 10000000 }; } const creditLimit = price.credits; - const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit; + + // Removal of + credits + const creditUsagePercentage = adjustedCreditsUsed / creditLimit; // Compare the adjusted total credits used with the credits allowed by the plan - if (adjustedCreditsUsed + credits > price.credits) { - // await sendNotification( - // team_id, - // NotificationType.LIMIT_REACHED, 
- // subscription.current_period_start, - // subscription.current_period_end - // ); + if (adjustedCreditsUsed > price.credits) { + await sendNotification( + team_id, + NotificationType.LIMIT_REACHED, + subscription.current_period_start, + subscription.current_period_end + ); return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed }; - } else if (creditUsagePercentage >= 0.8) { + } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit - // await sendNotification( - // team_id, - // NotificationType.APPROACHING_LIMIT, - // subscription.current_period_start, - // subscription.current_period_end - // ); + await sendNotification( + team_id, + NotificationType.APPROACHING_LIMIT, + subscription.current_period_start, + subscription.current_period_end + ); } return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed }; From fbdfa1256bb6095a08434b356fb51688d5337780 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 8 Sep 2024 13:07:10 -0300 Subject: [PATCH 47/62] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index d22f0372..6a71b40a 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -308,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } // 5. Compare the total credits used with the credits allowed by the plan. 
- if (totalCreditsUsed > FREE_CREDITS) { + if (totalCreditsUsed >= FREE_CREDITS) { // Send email notification for insufficient credits await sendNotification( team_id, @@ -405,7 +405,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const creditUsagePercentage = adjustedCreditsUsed / creditLimit; // Compare the adjusted total credits used with the credits allowed by the plan - if (adjustedCreditsUsed > price.credits) { + if (adjustedCreditsUsed >= price.credits) { await sendNotification( team_id, NotificationType.LIMIT_REACHED, From 60a15d00eb73244257b99dfd05a2d55b0aab9dd4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 8 Sep 2024 16:39:12 -0300 Subject: [PATCH 48/62] Update types.ts --- apps/api/src/controllers/v1/types.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index c4e0cf84..63ec1dd4 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -322,6 +322,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { removeTags: x.excludeTags, onlyMainContent: x.onlyMainContent, waitFor: x.waitFor, + headers: x.headers, includeLinks: x.formats.includes("links"), screenshot: x.formats.includes("screenshot"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"), From 22a5e85899eb893c9a68f53201e13f5fb569bc46 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:26:55 -0300 Subject: [PATCH 49/62] Update index.ts --- apps/api/src/index.ts | 102 ++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 58370158..1edf3759 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry" +import "./services/sentry"; import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; 
import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from 'node:http'; -import https from 'node:https'; -import CacheableLookup from 'cacheable-lookup'; +import http from "node:http"; +import https from "node:https"; +import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup:false + lookup: false, }); cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent) +cacheable.install(https.globalAgent); if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,9 +115,7 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([ - scrapeQueue.getWaitingCount(), - ]); + const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -190,38 +188,77 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { - if (err instanceof ZodError) { - res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: Response, + next: NextFunction + ) => { + if (err instanceof ZodError) { + res + .status(400) + .json({ success: false, error: "Bad Request", 
details: err.errors }); + } else { next(err); + } } - }); + ); Sentry.setupExpressErrorHandler(app); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: ResponseWithSentry, + next: NextFunction + ) => { + if ( + err instanceof SyntaxError && + "status" in err && + err.status === 400 && + "body" in err + ) { + return res + .status(400) + .json({ success: false, error: "Bad request, malformed JSON" }); + } - Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); - }); + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error( + "Error occurred in request! (" + + req.path + + ") -- ID " + + id + + " -- " + + verbose + ); + res + .status(500) + .json({ + success: false, + error: + "An unexpected error occurred. Please contact hello@firecrawl.com for help. 
Your exception ID is " + + id, + }); + } + ); Logger.info(`Worker ${process.pid} started`); } - - // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -230,6 +267,3 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); - - - From ca9a781eb7fbadf7aee7dd6926aea3a0b1ca5e07 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:27:55 -0300 Subject: [PATCH 50/62] Update index.ts --- apps/api/src/index.ts | 106 +++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1edf3759..7d8817af 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry"; +import "./services/sentry" import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from "node:http"; -import https from "node:https"; -import CacheableLookup from "cacheable-lookup"; +import http from 'node:http'; +import https from 'node:https'; +import CacheableLookup from 'cacheable-lookup'; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup: false, + lookup:false }); cacheable.install(http.globalAgent); 
-cacheable.install(https.globalAgent); +cacheable.install(https.globalAgent) if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,7 +115,9 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); + const [waitingJobs] = await Promise.all([ + scrapeQueue.getWaitingCount(), + ]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -188,77 +190,42 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: Response, - next: NextFunction - ) => { - if (err instanceof ZodError) { - res - .status(400) - .json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { + if (err instanceof ZodError) { + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); - } } - ); + }); Sentry.setupExpressErrorHandler(app); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: ResponseWithSentry, - next: NextFunction - ) => { - if ( - err instanceof SyntaxError && - "status" in err && - err.status === 400 && - "body" in err - ) { - return res - .status(400) - .json({ success: false, error: "Bad request, malformed JSON" }); - } - - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } - - Logger.error( - "Error occurred in request! (" + - req.path + - ") -- ID " + - id + - " -- " + - verbose - ); - res - .status(500) - .json({ - success: false, - error: - "An unexpected error occurred. 
Please contact hello@firecrawl.com for help. Your exception ID is " + - id, - }); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { + return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); } - ); + + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + }); Logger.info(`Worker ${process.pid} started`); } + + // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -267,3 +234,6 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); + + + From 17e419a7fb82dacba45692ea676f0487e66d5f70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 21:06:23 -0300 Subject: [PATCH 51/62] Nick: --- .../scraper/WebScraper/scrapers/fireEngine.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e7361c5c..a3f393c8 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -55,7 +55,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await 
generateRequestParams(url); let waitParam = reqParams["params"]?.wait ?? waitFor; - let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; + let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8bafd203..2be65899 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -96,15 +96,15 @@ function getScrapingFallbackOrder( "fetch", ].filter(Boolean); - if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { - defaultOrder = [ - "fire-engine", - useFireEngine ? undefined : "playwright", - ...defaultOrder.filter( - (scraper) => scraper !== "fire-engine" && scraper !== "playwright" - ), - ].filter(Boolean); - } + // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { + // defaultOrder = [ + // "fire-engine", + // useFireEngine ? 
undefined : "playwright", + // ...defaultOrder.filter( + // (scraper) => scraper !== "fire-engine" && scraper !== "playwright" + // ), + // ].filter(Boolean); + // } const filteredDefaultOrder = defaultOrder.filter( (scraper: (typeof baseScrapers)[number]) => From 26f2095de61103e854ef95326b6e0570b2494879 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 09:24:23 +0200 Subject: [PATCH 52/62] fix(v1): proper Invalid URL handling --- apps/api/src/controllers/v1/types.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 63ec1dd4..f812f981 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -30,7 +30,14 @@ export const url = z.preprocess( "URL must have a valid top-level domain or be a valid path" ) .refine( - (x) => checkUrl(x as string), + (x) => { + try { + checkUrl(x as string) + return true; + } catch (_) { + return false; + } + }, "Invalid URL" ) .refine( From b4dbf7553750a54040ff47fea9042d2858aaa9cd Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 10:25:14 +0200 Subject: [PATCH 53/62] fix(v1): check if url is string in blocklistMiddleware Fixes FIRECRAWL-SCRAPER-JS-9Z --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index daa9bf43..484ab5dc 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -83,7 +83,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) } function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { - if (req.body.url && isUrlBlocked(req.body.url)) { + if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { if (!res.headersSent) { return res.status(403).json({ success: false, error: "URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions." }); } From a17e1cac929ace616e371b4df4100a1029300609 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 10 Sep 2024 06:53:24 -0300 Subject: [PATCH 54/62] Rate bump --- apps/api/src/services/rate-limiter.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index dade8493..7cfff35b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -6,7 +6,7 @@ const RATE_LIMITS = { crawl: { default: 3, free: 2, - starter: 3, + starter: 10, standard: 5, standardOld: 40, scale: 50, @@ -19,9 +19,9 @@ const RATE_LIMITS = { scrape: { default: 20, free: 10, - starter: 20, + starter: 100, standard: 100, - standardOld: 40, + standardOld: 100, scale: 500, hobby: 20, standardNew: 100, @@ -32,8 +32,8 @@ const RATE_LIMITS = { search: { default: 20, free: 5, - starter: 20, - standard: 40, + starter: 50, + standard: 50, standardOld: 40, scale: 500, hobby: 10, @@ -45,9 +45,9 @@ const RATE_LIMITS = { map:{ default: 20, free: 5, - starter: 20, - standard: 40, - standardOld: 40, + starter: 50, + standard: 50, + standardOld: 50, scale: 500, hobby: 10, standardNew: 50, From 45237a29dde6f38af4a1a9b7c3d203fbb6c38795 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:09:39 -0300 Subject: [PATCH 55/62] updated js-sdk examples --- apps/js-sdk/example.js | 2 +- apps/js-sdk/example.ts | 2 +- apps/js-sdk/package-lock.json | 76 +++++++++++++++++++++++++++++++++-- apps/js-sdk/package.json | 1 + 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js index eb4bc489..c4b21d5f 100644 --- a/apps/js-sdk/example.js +++ b/apps/js-sdk/example.js @@ -1,4 +1,4 @@ -import FirecrawlApp from '@mendable/firecrawl-js'; +import FirecrawlApp from 'firecrawl'; 
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts index 4142416f..7412e479 100644 --- a/apps/js-sdk/example.ts +++ b/apps/js-sdk/example.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from '@mendable/firecrawl-js'; +import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from 'firecrawl'; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 95dd7d27..b0f358cb 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,8 +9,8 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.36", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", @@ -422,12 +422,14 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.36", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz", - "integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz", + "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==", "dependencies": { "axios": "^1.6.8", "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -594,6 +596,32 @@ "@esbuild/win32-x64": "0.20.2" } }, + "node_modules/firecrawl": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/firecrawl/-/firecrawl-1.2.0.tgz", + "integrity": "sha512-Sy1BCCvs5FhGc4yxPP7NG9iWnK8RXdvA1ZS/K1Gj+LrEN3iAT2WRzhYET7x8G2bif25F6rHJg57vdVb5sr6RyQ==", + "dependencies": { + "axios": "^1.6.8", + "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", + "uuid": 
"^9.0.1", + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.0" + } + }, + "node_modules/firecrawl/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/follow-redirects": { "version": "1.15.6", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", @@ -652,6 +680,20 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/isows": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz", + "integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/wagmi-dev" + } + ], + "peerDependencies": { + "ws": "*" + } + }, "node_modules/make-error": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", @@ -763,6 +805,11 @@ "node": ">=14.17" } }, + "node_modules/typescript-event-target": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz", + "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -786,6 +833,27 @@ "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==" }, + "node_modules/ws": { + "version": "8.18.0", + "resolved": 
"https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "peer": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/yn": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index b5d919f4..ac3ef038 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -13,6 +13,7 @@ "dependencies": { "@mendable/firecrawl-js": "^1.0.3", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", From ee8a54213c50ae88720ce5a03f76a65d270e81d0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:25:27 -0300 Subject: [PATCH 56/62] fix(py-sdk): removed asyncio package tested websocket with example.py without asyncio and it works with no problem. 
--- apps/python-sdk/firecrawl/firecrawl.py | 1 - apps/python-sdk/pyproject.toml | 3 +-- apps/python-sdk/requirements.txt | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 254f4c70..3961631e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -13,7 +13,6 @@ import logging import os import time from typing import Any, Dict, Optional, List -import asyncio import json import requests diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 969fb051..87cb91f1 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -12,8 +12,7 @@ dependencies = [ "requests", "python-dotenv", "websockets", - "asyncio", -"nest-asyncio" + "nest-asyncio" ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index 94971fde..db67ceeb 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -2,5 +2,4 @@ requests pytest python-dotenv websockets -asyncio nest-asyncio \ No newline at end of file From f855ad3436f97972383193980f1fb9f775636a0f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:29:44 -0300 Subject: [PATCH 57/62] bumping py-sdk version --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index f178cd61..540ce67e 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.2.3" +__version__ = "1.2.4" # Define the logger for the Firecrawl project logger: logging.Logger = 
logging.getLogger("firecrawl") From 4ebc35c9dde46e1fd2e38364000aa493287b9650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 18:59:09 +0200 Subject: [PATCH 58/62] fix(crawl-status): add success: true --- apps/api/src/controllers/v1/crawl-status-ws.ts | 1 + apps/api/src/controllers/v1/crawl-status.ts | 1 + apps/api/src/controllers/v1/types.ts | 1 + 3 files changed, 3 insertions(+) diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 8d823096..16a67682 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -103,6 +103,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth Date: Tue, 10 Sep 2024 19:29:38 +0200 Subject: [PATCH 59/62] feat(js-sdk): paginate next on checkCrawlStatus + better types for CSR --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 62 ++++++++++++++++++------------ 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 7114a625..75ebe390 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.2", + "version": "1.2.3", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 8b16adfb..55c5be0b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -131,15 +131,14 @@ export interface CrawlResponse { */ export interface CrawlStatusResponse { success: true; - total: number; + status: "scraping" | "completed" | "failed" | "cancelled"; completed: number; + total: number; creditsUsed: number; expiresAt: Date; - status: "scraping" | "completed" | "failed"; - next: string; - data?: 
FirecrawlDocument[]; - error?: string; -} + next?: string; + data: FirecrawlDocument[]; +}; /** * Parameters for mapping operations. @@ -329,9 +328,10 @@ export default class FirecrawlApp { /** * Checks the status of a crawl job using the Firecrawl API. * @param id - The ID of the crawl operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. */ - async checkCrawlStatus(id?: string): Promise { + async checkCrawlStatus(id?: string, getAllData = false): Promise { if (!id) { throw new Error("No crawl ID provided"); } @@ -342,17 +342,29 @@ export default class FirecrawlApp { `${this.apiUrl}/v1/crawl/${id}`, headers ); - if (response.status === 200) { + if (response.status === 200 && getAllData) { + let allData = response.data.data; + if (response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } return ({ - success: true, + success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), next: response.data.next, - data: response.data.data, - error: response.data.error + data: allData, + error: response.data.error, }) } else { this.handleError(response, "check crawl status"); @@ -452,7 +464,7 @@ export default class FirecrawlApp { id: string, headers: AxiosRequestHeaders, checkInterval: number - ): Promise { + ): Promise { while (true) { let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, @@ -460,20 +472,20 @@ export default class FirecrawlApp { ); if (statusResponse.status === 200) { let statusData = 
statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - let data = statusData.data; - while ('next' in statusData) { - statusResponse = await this.getRequest(statusData.next, headers); - statusData = statusResponse.data; - data = data.concat(statusData.data); + if (statusData.status === "completed") { + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; + return statusData; + } else { + throw new Error("Crawl job completed but no data was returned"); } - statusData.data = data; - return statusData; - } else { - throw new Error("Crawl job completed but no data was returned"); - } - } else if ( + } else if ( ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { checkInterval = Math.max(checkInterval, 2); From ad1a6fbc74eeb51c8ac2be870c4535382c8e0428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 19:41:01 +0200 Subject: [PATCH 60/62] fix(v1/map): handle invalid URLs gracefully --- apps/api/src/controllers/v1/map.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index e6abd9ae..a9c61d04 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -88,7 +88,13 @@ export async function mapController( links = performCosineSimilarity(links, searchQuery); } - links = links.map((x) => checkAndUpdateURLForMap(x).url.trim()); + links = links.map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim() + } catch (_) { + return null; + } + }).filter(x => x !== null); // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); From 83a165db0fd0e680f4dfb1c41cbcb20901d5e8f1 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 21:18:53 +0200 Subject: [PATCH 61/62] fix(v0/scrape): ensure url is string --- apps/api/src/controllers/v0/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index bc91da18..2a5f1d4f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -39,7 +39,7 @@ export async function scrapeHelper( returnCode: number; }> { const url = req.body.url; - if (!url) { + if (typeof url !== "string") { return { success: false, error: "Url is required", returnCode: 400 }; } From 97ffabff3a6b6bd1c7455b85ce794949f540469b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 21:21:20 +0200 Subject: [PATCH 62/62] fix(v1): converting bad docs always gives null --- apps/api/src/controllers/v1/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 6b2db308..c44c1cc5 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -348,7 +348,7 @@ export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions { } export function legacyDocumentConverter(doc: any): Document { - if (doc === null || doc === undefined) return doc; + if (doc === null || doc === undefined) return null; if (doc.metadata) { if (doc.metadata.screenshot) {