From 6dc5b1cd64b9a8c642bc846c87af811da44838a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 19 Mar 2025 17:39:43 +0100 Subject: [PATCH 001/160] feat(auth): acuc update --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index a32a437e..ccd1a5b6 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -100,7 +100,7 @@ export async function getACUC( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_23_tally", + "auth_credit_usage_chunk_24_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 0bdaa97bfbcbfcf5aff5dc888914535f2f328fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 19 Mar 2025 21:10:52 +0100 Subject: [PATCH 002/160] acuc fix --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index ccd1a5b6..d68615b2 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -100,7 +100,7 @@ export async function getACUC( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_24_tally", + "auth_credit_usage_chunk_25_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 3e0d3db90e6d5d71772c0edd867e4aedc8b1a3c5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 19 Mar 2025 23:20:18 -0400 Subject: [PATCH 003/160] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index b8449a92..e87fa611 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -280,7 +280,7 @@ export function getRateLimiter( return etier1aRateLimiter; } - if (teamId && teamId === process.env.ETIER2A_TEAM_ID) { + if (teamId && (teamId === process.env.ETIER2A_TEAM_ID || teamId === process.env.ETIER2A_TEAM_ID_B)) { return etier2aRateLimiter; } From 867e545511b4c8fa7fa3c0a43c7e857c3ef4bccc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 19 Mar 2025 23:25:05 -0400 Subject: [PATCH 004/160] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index e87fa611..be53bf80 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -314,7 +314,7 @@ export function getConcurrencyLimitMax( return CONCURRENCY_LIMIT.etier1a; } - if (teamId && teamId === process.env.ETIER2A_TEAM_ID) { + if (teamId && (teamId === process.env.ETIER2A_TEAM_ID || teamId === process.env.ETIER2A_TEAM_ID_B)) { return CONCURRENCY_LIMIT.etier2a; } From 2fb29ee46ee4b1bd194d6eb575e63cff049e3f43 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Thu, 20 Mar 2025 15:26:02 +0530 Subject: [PATCH 005/160] Add examples/ mistra- small-3.1-crawler --- .../mistral-small-3.1-crawler.py | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 examples/mistral-small-3.1-crawler/mistral-small-3.1-crawler.py diff --git a/examples/mistral-small-3.1-crawler/mistral-small-3.1-crawler.py b/examples/mistral-small-3.1-crawler/mistral-small-3.1-crawler.py new file mode 100644 index 00000000..9c8c3173 --- /dev/null +++ b/examples/mistral-small-3.1-crawler/mistral-small-3.1-crawler.py @@ -0,0 +1,272 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from mistralai import Mistral + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +mistral_api_key = os.getenv("MISTRAL_API_KEY") + +# Initialize the FirecrawlApp and Mistral client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = Mistral(api_key=mistral_api_key) + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + completion = client.chat.complete( + model="mistral-small-latest", + messages=[ + { + "role": "user", + "content": map_prompt + } + ] + ) + + map_search_parameter = completion.choices[0].message.content + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + + # Debug print to see the response structure + print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}") + + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + # Handle the response based on its structure + if isinstance(map_website, dict): + # Assuming the links are in a 'urls' or similar key + links = map_website.get('urls', []) or map_website.get('links', []) + elif isinstance(map_website, str): + try: + parsed = json.loads(map_website) + links = parsed.get('urls', []) or parsed.get('links', []) + except json.JSONDecodeError: + links = [] + else: + links = map_website if isinstance(map_website, list) else [] + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + rank_prompt = f""" + Given this list of URLs and the objective: {objective} + Analyze each URL and rank the top 3 most relevant ones that are most likely to contain the information we need. + Return your response as a JSON array with exactly 3 objects, each containing: + - "url": the full URL + - "relevance_score": number between 0-100 indicating relevance to objective + - "reason": brief explanation of why this URL is relevant + + Example output: + [ + {{ + "url": "https://example.com/about", + "relevance_score": 95, + "reason": "Main about page containing company information" + }}, + {{ + "url": "https://example.com/team", + "relevance_score": 80, + "reason": "Team page with leadership details" + }}, + {{ + "url": "https://example.com/contact", + "relevance_score": 70, + "reason": "Contact page with location information" + }} + ] + + URLs to analyze: + {json.dumps(links, indent=2)} + """ + + print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}") + completion = client.chat.complete( + model="mistral-small-latest", + messages=[ + { + "role": "user", + "content": rank_prompt + } + ] + ) + + # Debug print to see Mistral's raw response + print(f"{Colors.MAGENTA}Debug - Mistral's raw response:{Colors.RESET}") + print(f"{Colors.MAGENTA}{completion.choices[0].message.content}{Colors.RESET}") + + try: + # Try to clean the response by stripping any potential markdown or extra whitespace + cleaned_response = completion.choices[0].message.content.strip() + print(f"{Colors.YELLOW}Attempting to extract JSON from response...{Colors.RESET}") + + # Extract JSON block if it exists between triple backticks + if "```json" in cleaned_response and "```" in cleaned_response.split("```json", 1)[1]: + print(f"{Colors.GREEN}Found JSON code block, extracting...{Colors.RESET}") + cleaned_response = cleaned_response.split("```json", 1)[1].split("```", 1)[0].strip() + # If no code blocks but contains square brackets, try to extract just the JSON array + elif "[" in cleaned_response and "]" in cleaned_response: + print(f"{Colors.GREEN}Found JSON array markers, extracting...{Colors.RESET}") + start_idx = cleaned_response.find("[") + end_idx = cleaned_response.rfind("]") + 1 + cleaned_response = cleaned_response[start_idx:end_idx].strip() + + print(f"{Colors.YELLOW}Parsing extracted content: {cleaned_response[:100]}...{Colors.RESET}") + ranked_results = json.loads(cleaned_response) + print(f"{Colors.GREEN}Successfully parsed JSON response{Colors.RESET}") + + # Validate the structure of the results + if not isinstance(ranked_results, list): + raise ValueError("Response is not a list") + + for result in ranked_results: + if not all(key in result for key in ["url", "relevance_score", "reason"]): + raise ValueError("Response items missing required fields") + + links = [result["url"] for result in ranked_results] + + # Print detailed ranking info + print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}") + for result in ranked_results: + print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}") + print(f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}") + print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}") + print("---") + + if not links: + print(f"{Colors.RED}No relevant links identified.{Colors.RESET}") + return None + + except (json.JSONDecodeError, KeyError) as e: + print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}") + return links + + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app, client): + try: + # Get top 3 links from the map result + if not map_website: + print(f"{Colors.RED}No links found to analyze.{Colors.RESET}") + return None + + top_links = map_website[:3] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + for link in top_links: + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. + If the objective is not met with confidence, respond with exactly 'Objective not met'. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + + Remember: + 1. Only return JSON if you are confident the objective is fully met. + 2. Keep the JSON structure as simple and flat as possible. + 3. If returning JSON, ensure it's valid JSON format without any markdown formatting. + 4. If objective is not met, respond only with 'Objective not met'. + """ + + completion = client.chat.complete( + model="mistral-small-latest", + messages=[{"role": "user", "content": check_prompt}] + ) + + result = completion.choices[0].message.content.strip() + + # Clean up the response if it contains markdown formatting + # Extract JSON block if it exists between triple backticks + if "```json" in result and "```" in result.split("```json", 1)[1]: + print(f"{Colors.GREEN}Found JSON code block, extracting...{Colors.RESET}") + result = result.split("```json", 1)[1].split("```", 1)[0].strip() + # If no code blocks but contains curly braces, try to extract just the JSON object + elif result != "Objective not met" and "{" in result and "}" in result: + print(f"{Colors.GREEN}Found JSON object markers, extracting...{Colors.RESET}") + start_idx = result.find("{") + end_idx = result.rfind("}") + 1 + result = result[start_idx:end_idx].strip() + + if result == "Objective not met": + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + continue + + try: + print(f"{Colors.YELLOW}Parsing extracted content: {result[:100]}...{Colors.RESET}") + json_result = json.loads(result) + print(f"{Colors.GREEN}Successfully parsed JSON response{Colors.RESET}") + print(f"{Colors.GREEN}Objective fulfilled. Relevant information found.{Colors.RESET}") + return json_result + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}") + print(f"{Colors.MAGENTA}Raw response: {result}{Colors.RESET}") + continue + + print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input(f"{Colors.BLUE}Enter the website to crawl : {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app, client) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using Mistral Small 3.1...{Colors.RESET}") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information :{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file From 5e35782b5d97034d3a6599cf9e954ea39b18284d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 20 Mar 2025 17:47:29 +0100 Subject: [PATCH 006/160] update acuc --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index d68615b2..ae9abcff 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -100,7 +100,7 @@ export async function getACUC( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_25_tally", + "auth_credit_usage_chunk_26_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 6a6199eb4b6e76ab4078f48e2685b996cd2ec8ed Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Fri, 21 Mar 2025 14:03:31 +0530 Subject: [PATCH 007/160] Add examples/mistral 3.1 company researcher --- .../mistral-small-3.1-extractor.py | 376 ++++++++++++++++++ 1 file changed, 376 insertions(+) create mode 100644 examples/mistral-small-3.1-extractor/mistral-small-3.1-extractor.py diff --git a/examples/mistral-small-3.1-extractor/mistral-small-3.1-extractor.py b/examples/mistral-small-3.1-extractor/mistral-small-3.1-extractor.py new file mode 100644 index 00000000..3e9f4d33 --- /dev/null +++ b/examples/mistral-small-3.1-extractor/mistral-small-3.1-extractor.py @@ -0,0 +1,376 @@ +import os +import json +import time +import requests +from dotenv import load_dotenv +from serpapi.google_search import GoogleSearch +from mistralai import Mistral + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY")) +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +serp_api_key = os.getenv("SERP_API_KEY") + + +if not firecrawl_api_key: + print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}") + +if not os.getenv("MISTRAL_API_KEY"): + print(f"{Colors.RED}Warning: MISTRAL_API_KEY not found in environment variables{Colors.RESET}") + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": serp_api_key}) + return search.get_dict().get("organic_results", []) + +def select_urls_with_mistral(company, objective, serp_results): + """ + Use Mistral Small 3.1 to select URLs from SERP results with enhanced criteria. + Returns a list of URLs with confidence scores and justifications. + """ + try: + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + prompt = ( + "Task: Select the MOST RELIABLE and RELEVANT URLs that contain VERIFIABLE information about the specified company.\n\n" + "Instructions:\n" + "1. Analyze the search results for information SPECIFICALLY about the requested objective\n" + "2. Select ONLY official and highly reliable URLs that DIRECTLY address the requested information\n" + "3. Prioritize in this exact order:\n" + " a. The company's official website sections that specifically address the requested information\n" + " b. Official company documents (annual reports, SEC filings, press releases) that contain verifiable data\n" + " c. Government databases or regulatory filings that contain verified information\n" + " d. Trusted industry databases with cited sources (e.g., Bloomberg, Reuters, industry associations)\n" + "4. EXCLUDE any sources that:\n" + " a. Contain primarily opinions or analysis rather than facts\n" + " b. Are outdated (older than 1 year unless historical information is requested)\n" + " c. Are from general news sites without specific expertise in the topic\n" + " d. Do not cite their sources or methodology\n" + " e. Are social media links or user-generated content\n" + "5. For each URL selected, provide a confidence score (1-10) and brief justification\n" + "6. Limit selection to 3-5 of the MOST RELIABLE and RELEVANT sources only\n" + "7. Return a JSON object with the following structure: {\"selected_urls\": [{\"url\": \"url1\", \"confidence\": 9, \"justification\": \"Official company annual report with audited figures\"}]}\n\n" + f"Company: {company}\n" + f"Information Needed: {objective}\n" + f"Search Results: {json.dumps(serp_data, indent=2)}\n\n" + "Response Format: {\"selected_urls\": [{\"url\": \"https://example.com\", \"confidence\": 9, \"justification\": \"Reason this is reliable\"}]}" + ) + + response = mistral_client.chat.complete( + model="mistral-small-latest", + messages=[ + {"role": "user", "content": prompt} + ] + ) + + # Clean the response text + cleaned_response = response.choices[0].message.content.strip() + if cleaned_response.startswith('```'): + cleaned_response = cleaned_response.split('```')[1] + if cleaned_response.startswith('json'): + cleaned_response = cleaned_response[4:] + cleaned_response = cleaned_response.strip() + + try: + # Parse JSON response + result = json.loads(cleaned_response) + if isinstance(result, dict) and "selected_urls" in result: + url_data = result["selected_urls"] + # Extract just the URLs for compatibility with existing code + urls = [item["url"] for item in url_data if "url" in item] + + # Print detailed information about selected URLs + print(f"{Colors.CYAN}Selected URLs with confidence scores:{Colors.RESET}") + for item in url_data: + if "url" in item and "confidence" in item and "justification" in item: + print(f"- {item['url']} (Confidence: {item['confidence']}/10)") + print(f" Justification: {item['justification']}") + else: + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + except json.JSONDecodeError: + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + + # Clean up URLs + cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls] + cleaned_urls = [url for url in cleaned_urls if url] + + # Limit to top 5 URLs to ensure quality over quantity + cleaned_urls = cleaned_urls[:5] + + if not cleaned_urls: + print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}") + return [] + + # Return the URLs for cross-verification + return cleaned_urls + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}") + return [] + +def cross_verify_sources(urls, company, objective): + """Use Mistral to cross-verify information across selected sources.""" + + print(f"{Colors.YELLOW}Cross-verifying selected sources...{Colors.RESET}") + + verification_prompt = ( + f"Task: Evaluate the reliability and consistency of these sources for information about {company}.\n\n" + f"Objective: {objective}\n\n" + f"URLs to evaluate: {json.dumps(urls)}\n\n" + "Instructions:\n" + "1. For each URL, identify what makes it reliable or unreliable for the specific objective\n" + "2. Assess whether these sources are likely to provide consistent or contradictory information\n" + "3. Identify any potential biases in these sources (e.g., company's own website may present favorable information)\n" + "4. Recommend the final set of URLs that, when used together, will provide the most accurate and complete information\n" + "5. IMPORTANT: Only include URLs that are DIRECTLY relevant to the specific objective\n" + "6. Exclude any URLs that contain primarily general information about the company not related to the objective\n" + "7. Return a JSON object with: {\"verified_urls\": [\"url1\", \"url2\"], \"verification_notes\": \"explanation\"}\n" + ) + + try: + response = mistral_client.chat.complete( + model="mistral-small-latest", + messages=[ + {"role": "user", "content": verification_prompt} + ] + ) + + # Clean the response text + cleaned_response = response.choices[0].message.content.strip() + if cleaned_response.startswith('```'): + cleaned_response = cleaned_response.split('```')[1] + if cleaned_response.startswith('json'): + cleaned_response = cleaned_response[4:] + cleaned_response = cleaned_response.strip() + + try: + # Parse JSON response + result = json.loads(cleaned_response) + if isinstance(result, dict) and "verified_urls" in result: + verified_urls = result["verified_urls"] + verification_notes = result.get("verification_notes", "") + + print(f"{Colors.CYAN}Cross-verification complete:{Colors.RESET}") + print(f"{Colors.CYAN}Notes: {verification_notes}{Colors.RESET}") + print(f"{Colors.CYAN}Final verified URLs:{Colors.RESET}") + for url in verified_urls: + print(f"- {url}") + + return verified_urls + else: + # If JSON parsing fails, return original URLs + print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}") + return urls + except json.JSONDecodeError: + # If JSON parsing fails, return original URLs + print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}") + return urls + + except Exception as e: + print(f"{Colors.RED}Error during cross-verification: {str(e)}{Colors.RESET}") + return urls # Return original URLs if cross-verification fails + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + # Enhanced prompt for better data quality + enhanced_prompt = ( + f"Extract accurate and verified information about {company}. " + f"Specifically focus on: {prompt}. " + f"IMPORTANT INSTRUCTIONS:\n" + f"1. Only include information that is EXPLICITLY stated in the source material\n" + f"2. Do NOT include any speculative information\n" + f"3. If information conflicts between sources, prioritize information from the company's official website\n" + f"4. For each piece of information, cite the specific source URL\n" + f"5. Assign a confidence score (1-10) to each piece of information based on source reliability\n" + f"6. ONLY include information that is DIRECTLY relevant to the specific request\n" + f"7. EXCLUDE any tangential or general information about the company not related to the specific request\n" + f"8. Format the response as a structured JSON with clear categories related to the request\n" + f"9. For each data point, include both the information and its source in this format: {{\"value\": \"information\", \"source\": \"url\", \"confidence\": 8}}\n" + f"10. If multiple sources confirm the same information, cite all sources and increase the confidence score\n" + f"11. If you cannot find specific information requested, explicitly state that it was not found in the sources rather than providing general information" + ) + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": enhanced_prompt, + "enableWebSearch": False # Changed to False to rely only on verified URLs + } + + try: + # Print the payload for debugging + print(f"{Colors.YELLOW}Request payload:{Colors.RESET}") + print(json.dumps(payload, indent=2)) + + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload, + timeout=30 + ) + + # Print detailed response for debugging + print(f"{Colors.YELLOW}Response status code: {response.status_code}{Colors.RESET}") + print(f"{Colors.YELLOW}Response headers: {response.headers}{Colors.RESET}") + + data = response.json() + print(f"{Colors.YELLOW}Response body:{Colors.RESET}") + print(json.dumps(data, indent=2)) + + if not data.get('success'): + print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}") + return None + + extraction_id = data.get('id') + if not extraction_id: + print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}") + return None + + return poll_firecrawl_result(extraction_id, api_key) + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request failed: {e}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=60): + """Poll Firecrawl API to get the extraction result.""" + url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}" + headers = { + 'Authorization': f'Bearer {api_key}' + } + + print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}") + + # Show a simple progress indicator instead of "still processing" messages + print(f"{Colors.YELLOW}[", end="", flush=True) + + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + if data.get('success') and data.get('data'): + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + + # Validate and clean the extracted data + validated_data = validate_extracted_data(data['data']) + print(json.dumps(validated_data, indent=2)) + return validated_data + elif data.get('success') and not data.get('data'): + # Show a simple progress indicator + print(f"{Colors.YELLOW}.", end="", flush=True) + time.sleep(interval) + else: + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}") + return None + + except requests.exceptions.RequestException as e: + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}") + return None + except Exception as e: + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}") + return None + + print(f"]{Colors.RESET}") # Close the progress indicator + print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}") + return None + +def validate_extracted_data(data): + """Validate and clean the extracted data to reduce misinformation.""" + if not data or not isinstance(data, dict): + return data + + # Look for confidence scores or source information if available + validated_data = {} + + for key, value in data.items(): + # Skip entries that indicate uncertainty + if isinstance(value, str) and any(term in value.lower() for term in ["unknown", "unclear", "not specified", "not found", "couldn't find"]): + continue + + # Keep entries with clear information + validated_data[key] = value + + return validated_data + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + # Add more specific search terms for better results + search_query = f"{company} {objective}" + # print(f"{Colors.YELLOW}Searching Google for '{search_query}'...{Colors.RESET}") + serp_results = search_google(search_query) + + if not serp_results: + # Fallback to just company name + print(f"{Colors.YELLOW}No results found. Trying broader search...{Colors.RESET}") + serp_results = search_google(company) + + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + # Select URLs with Mistral + selected_urls = select_urls_with_mistral(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}No URLs were selected.{Colors.RESET}") + return + + # Cross-verify the selected sources + verified_urls = cross_verify_sources(selected_urls, company, objective) + + if not verified_urls: + print(f"{Colors.YELLOW}No URLs were verified. Using original selected URLs.{Colors.RESET}") + verified_urls = selected_urls + + data = extract_company_info(verified_urls, objective, company, firecrawl_api_key) + + if data: + print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}") + else: + print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file From 87539aaf160f00051f91feef0cd673fda006e3db Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Mon, 24 Mar 2025 20:06:08 +0530 Subject: [PATCH 008/160] Add example/Deep-research Apartment finder --- .../.env.example | 5 + .../deep-research-apartment-finder/README.md | 55 ++++ .../apartment_finder.py | 293 ++++++++++++++++++ .../requirements.txt | 3 + 4 files changed, 356 insertions(+) create mode 100644 examples/deep-research-apartment-finder/.env.example create mode 100644 examples/deep-research-apartment-finder/README.md create mode 100644 examples/deep-research-apartment-finder/apartment_finder.py create mode 100644 examples/deep-research-apartment-finder/requirements.txt diff --git a/examples/deep-research-apartment-finder/.env.example b/examples/deep-research-apartment-finder/.env.example new file mode 100644 index 00000000..7f34bdb7 --- /dev/null +++ b/examples/deep-research-apartment-finder/.env.example @@ -0,0 +1,5 @@ +# Firecrawl API key (get from https://firecrawl.dev) +FIRECRAWL_API_KEY=your_firecrawl_api_key_here + +# Anthropic API key (get from https://console.anthropic.com) +ANTHROPIC_API_KEY=your_anthropic_api_key_here \ No newline at end of file diff --git a/examples/deep-research-apartment-finder/README.md b/examples/deep-research-apartment-finder/README.md new file mode 100644 index 00000000..ccae9706 --- /dev/null +++ b/examples/deep-research-apartment-finder/README.md @@ -0,0 +1,55 @@ +# Apartment Finder CLI + +A command-line tool that uses Firecrawl's Deep Research API and Anthropic's Claude 3.7 to find and analyze apartment listings based on your preferences. + +## Features + +- Interactive input for apartment search preferences +- Searches apartments by location, budget, bedrooms, and amenities +- Automatically researches apartment listings across multiple websites +- Uses AI to analyze and extract the top 3 options +- Provides detailed information including price, location, features, and pros/cons +- Option to save results as JSON + +## Installation + +1. Clone this repository: + + ``` + git clone + cd apartment-finder + ``` + +2. Install dependencies: + + ``` + pip install -r requirements.txt + ``` + +3. Set up API keys: + - Copy `.env.example` to `.env` + - Fill in your Firecrawl API key from [firecrawl.dev](https://firecrawl.dev) + - Fill in your Anthropic API key from [console.anthropic.com](https://console.anthropic.com) + +## Usage + +Run the script and follow the interactive prompts: + +```bash +python apartment_finder.py +``` + +The script will prompt you for: + +- Location (city or neighborhood) +- Budget (maximum monthly rent) +- Number of bedrooms +- Desired amenities + +After searching and analyzing, the tool will display the top apartment options and offer to save the results to a JSON file. + +## Notes + +- The search process may take a few minutes due to the deep research API. +- Results will vary based on available apartment listings at the time of search. +- API usage may incur costs depending on your Firecrawl and Anthropic subscription plans. diff --git a/examples/deep-research-apartment-finder/apartment_finder.py b/examples/deep-research-apartment-finder/apartment_finder.py new file mode 100644 index 00000000..3a599a0d --- /dev/null +++ b/examples/deep-research-apartment-finder/apartment_finder.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 + +import os +import sys +import json +from typing import Dict, List, Any +import anthropic +from firecrawl import FirecrawlApp +from dotenv import load_dotenv + +# Define colors for terminal output +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + BOLD = '\033[1m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") +ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") + +if not FIRECRAWL_API_KEY or not ANTHROPIC_API_KEY: + print(f"{Colors.RED}Error: API keys not found. Please set FIRECRAWL_API_KEY and ANTHROPIC_API_KEY environment variables.{Colors.RESET}") + print(f"{Colors.YELLOW}You can create a .env file with these variables or set them in your shell.{Colors.RESET}") + sys.exit(1) + +# Initialize clients +firecrawl = FirecrawlApp(api_key=FIRECRAWL_API_KEY) +claude = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY) + +def get_user_preferences(): + """Get apartment search preferences from user input""" + print(f"\n{Colors.BOLD}{Colors.CYAN}=== Apartment Finder ==={Colors.RESET}") + print(f"{Colors.CYAN}Please enter your apartment search preferences:{Colors.RESET}") + + # Get required inputs + location = input(f"\n{Colors.YELLOW}Enter location (city or neighborhood): {Colors.RESET}") + while not location.strip(): + location = input(f"{Colors.RED}Location cannot be empty. Please enter a location: {Colors.RESET}") + + budget = input(f"{Colors.YELLOW}Enter your maximum budget (e.g., $2000): {Colors.RESET}") + while not budget.strip(): + budget = input(f"{Colors.RED}Budget cannot be empty. Please enter your maximum budget: {Colors.RESET}") + if not budget.startswith('$'): + budget = f"${budget}" + + # Get optional inputs with defaults + bedrooms = input(f"{Colors.YELLOW}Enter number of bedrooms (default: 1): {Colors.RESET}") or "1" + + amenities = input(f"{Colors.YELLOW}Enter desired amenities, separated by commas (e.g., gym,pool,parking): {Colors.RESET}") or "" + + return { + "location": location.strip(), + "budget": budget.strip(), + "bedrooms": bedrooms.strip(), + "amenities": amenities.strip() + } + +def build_search_query(user_prefs): + amenities_str = f" with {user_prefs['amenities'].replace(',', ', ')}" if user_prefs['amenities'] else "" + return f"{user_prefs['bedrooms']} bedroom apartments for rent in {user_prefs['location']} under {user_prefs['budget']}{amenities_str}" + +def research_apartments(query: str) -> Dict[str, Any]: + """Use Firecrawl's deep research to find apartment listings""" + print(f"\n{Colors.BOLD}{Colors.CYAN}🔍 INITIATING DEEP RESEARCH 🔍{Colors.RESET}") + print(f"{Colors.BLUE}Researching apartments with query: '{query}'{Colors.RESET}") + print(f"{Colors.BLUE}This may take a few minutes...{Colors.RESET}\n") + + # Define research parameters + params = { + "maxDepth": 3, # Number of research iterations + "timeLimit": 180, # Time limit in seconds + "maxUrls": 20 # Maximum URLs to analyze + } + + # Start research with real-time updates + def on_activity(activity): + activity_type = activity['type'] + message = activity['message'] + + if activity_type == 'info': + color = Colors.CYAN + elif activity_type == 'search': + color = Colors.BLUE + elif activity_type == 'scrape': + color = Colors.MAGENTA + elif activity_type == 'analyze': + color = Colors.GREEN + else: + color = Colors.RESET + + print(f"[{color}{activity_type}{Colors.RESET}] {message}") + + # Run deep research + results = firecrawl.deep_research( + query=query, + params=params, + on_activity=on_activity + ) + + return results + +def analyze_with_claude(research_results: Dict[str, Any], user_prefs: Dict[str, str]) -> List[Dict[str, Any]]: + """Use Claude to analyze apartment data and extract top options""" + print(f"\n{Colors.BOLD}{Colors.MAGENTA}🧠 ANALYZING RESULTS WITH CLAUDE 3.7 🧠{Colors.RESET}") + + # Extract relevant information from sources + sources_text = "\n\n".join([ + f"Source {i+1}:\n{source.get('content', '')}" + for i, source in enumerate(research_results['data']['sources'][:15]) # Limit to first 15 sources + ]) + + # Add the final analysis as an additional source + final_analysis = research_results['data'].get('finalAnalysis', '') + if final_analysis: + sources_text += f"\n\nFinal Analysis:\n{final_analysis}" + + # Prepare system prompt with better handling for limited data + system_prompt = """ + You are an expert apartment finder assistant. Your task is to analyze text about apartments and find the top apartment options that best match the user's preferences. + + If you find specific apartment listings with details, extract and organize them into exactly 3 options. + + For each listing you can identify, extract: + 1. Price (monthly rent) + 2. Location (specific neighborhood, address if available) + 3. Key features (bedrooms, bathrooms, square footage, type of building) + 4. Amenities (both in-unit and building amenities) + 5. Pros and cons (at least 3 of each) + + If you cannot find 3 complete listings with all details, do your best with the information available. You can: + - Create fewer than 3 listings if that's all you can find + - Extrapolate missing information based on similar listings or market trends + - For missing specific details, use general information about the area + + You MUST format your response as a JSON array of objects. Each object should have these exact fields: + - title (string) + - price (string) + - location (string) + - features (array of strings) + - amenities (array of strings) + - pros (array of strings) + - cons (array of strings) + + If you absolutely cannot find any apartment listings with enough details, return an array with a single object containing general information about apartments in the area, with "No specific listings found" as the title. + + Example JSON structure: + [ + { + "title": "Luxury 2BR in Downtown", + "price": "$2,500/month", + "location": "123 Main St, Downtown", + "features": ["2 bedrooms", "2 bathrooms", "950 sq ft"], + "amenities": ["In-unit laundry", "Parking garage", "Fitness center"], + "pros": ["Great location", "Modern appliances", "Pet friendly"], + "cons": ["Street noise", "Small kitchen", "Limited storage"] + } + ] + + Return ONLY the JSON array, nothing else. + """ + + # Create the user message + user_message = f""" + I'm looking for {user_prefs['bedrooms']} bedroom apartments in {user_prefs['location']} with a budget of {user_prefs['budget']}. + + Additional preferences: {user_prefs.get('amenities', 'None specified')} + + Please analyze the following information and find apartment options that match my criteria: + + {sources_text} + """ + + # Call Claude API + response = claude.messages.create( + model="claude-3-7-sonnet-20250219", + max_tokens=4000, + temperature=0, + system=system_prompt, + messages=[{"role": "user", "content": user_message}] + ) + + # Extract and parse JSON from response with better error handling + try: + content = response.content[0].text + + # Clean the content - strip markdown formatting or text before/after JSON + content = content.strip() + if content.startswith('```json'): + content = content[7:] + if content.endswith('```'): + content = content[:-3] + content = content.strip() + + # Look for JSON array in the response + if content.startswith('[') and content.endswith(']'): + return json.loads(content) + + # Try to find JSON brackets if not properly formatted + json_start = content.find('[') + json_end = content.rfind(']') + 1 + if json_start >= 0 and json_end > json_start: + json_str = content[json_start:json_end] + return json.loads(json_str) + + # If we can't find JSON, create a fallback response + print(f"{Colors.YELLOW}Could not find valid JSON in Claude's response, creating fallback response{Colors.RESET}") + return [{ + "title": "No specific listings found", + "price": f"Target: {user_prefs['budget']}", + "location": user_prefs['location'], + "features": [f"{user_prefs['bedrooms']} bedroom(s)"], + "amenities": user_prefs['amenities'].split(',') if user_prefs['amenities'] else ["Not specified"], + "pros": ["Information is based on general market research", "Consider visiting apartment listing websites directly", "Contact local real estate agents for current availability"], + "cons": ["No specific listings were found in the research", "Prices and availability may vary", "Additional research recommended"] + }] + except Exception as e: + print(f"{Colors.RED}Error parsing Claude's response: {e}{Colors.RESET}") + print(f"{Colors.YELLOW}Creating fallback response{Colors.RESET}") + return [{ + "title": "Error analyzing apartment listings", + "price": f"Target: {user_prefs['budget']}", + "location": user_prefs['location'], + "features": [f"{user_prefs['bedrooms']} bedroom(s)"], + "amenities": user_prefs['amenities'].split(',') if user_prefs['amenities'] else ["Not specified"], + "pros": ["Try refining your search criteria", "Consider searching specific apartment websites", "Contact local real estate agents"], + "cons": ["Search encountered technical difficulties", "Results may not be accurate", "Consider trying again later"] + }] + +def display_results(apartments: List[Dict[str, Any]]): + """Display the top apartment options in a readable format""" + if not apartments: + print(f"{Colors.RED}No suitable apartments found that match your criteria.{Colors.RESET}") + return + + print(f"\n{Colors.BOLD}{'=' * 80}{Colors.RESET}") + print(f"{Colors.BOLD}{Colors.GREEN}🏠 TOP {len(apartments)} APARTMENT OPTIONS 🏠{Colors.RESET}".center(80)) + print(f"{Colors.BOLD}{'=' * 80}{Colors.RESET}") + + for i, apt in enumerate(apartments): + print(f"\n{Colors.BOLD}{Colors.CYAN}🔑 OPTION {i+1}: {apt.get('title', 'Apartment')}{Colors.RESET}") + print(f"{Colors.YELLOW}💰 Price: {apt.get('price', 'N/A')}{Colors.RESET}") + print(f"{Colors.YELLOW}📍 Location: {apt.get('location', 'N/A')}{Colors.RESET}") + + print(f"\n{Colors.MAGENTA}📋 Features:{Colors.RESET}") + for feature in apt.get('features', []): + print(f" {Colors.BLUE}•{Colors.RESET} {feature}") + + print(f"\n{Colors.MAGENTA}✨ Amenities:{Colors.RESET}") + for amenity in apt.get('amenities', []): + print(f" {Colors.BLUE}•{Colors.RESET} {amenity}") + + print(f"\n{Colors.GREEN}👍 Pros:{Colors.RESET}") + for pro in apt.get('pros', []): + print(f" {Colors.BLUE}•{Colors.RESET} {pro}") + + print(f"\n{Colors.RED}👎 Cons:{Colors.RESET}") + for con in apt.get('cons', []): + print(f" {Colors.BLUE}•{Colors.RESET} {con}") + + print(f"\n{Colors.CYAN}{'-' * 80}{Colors.RESET}") + +def main(): + # Get user preferences through interactive input + user_prefs = get_user_preferences() + + # Print summary of search criteria + print(f"\n{Colors.BOLD}{Colors.CYAN}=== Search Criteria ==={Colors.RESET}") + print(f"{Colors.BLUE}Location: {Colors.YELLOW}{user_prefs['location']}{Colors.RESET}") + print(f"{Colors.BLUE}Budget: {Colors.YELLOW}{user_prefs['budget']}{Colors.RESET}") + print(f"{Colors.BLUE}Bedrooms: {Colors.YELLOW}{user_prefs['bedrooms']}{Colors.RESET}") + print(f"{Colors.BLUE}Amenities: {Colors.YELLOW}{user_prefs['amenities'] or 'None specified'}{Colors.RESET}") + + # Build search query + query = build_search_query(user_prefs) + + # Run research + research_results = research_apartments(query) + + # Analyze with Claude + top_apartments = analyze_with_claude(research_results, user_prefs) + + # Display results + display_results(top_apartments) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/deep-research-apartment-finder/requirements.txt b/examples/deep-research-apartment-finder/requirements.txt new file mode 100644 index 00000000..d7961953 --- /dev/null +++ b/examples/deep-research-apartment-finder/requirements.txt @@ -0,0 +1,3 @@ +anthropic==0.18.0 +firecrawl==0.2.0 +python-dotenv==1.0.0 \ No newline at end of file From a50dc106ef02b73b8cd39a35668ae4330f309146 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 24 Mar 2025 12:13:52 -0400 Subject: [PATCH 009/160] (feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365) * Nick: * Nick: structured output support * Nick: support for zod and pydantic --- .../controllers/v1/deep-research-status.ts | 1 + apps/api/src/controllers/v1/deep-research.ts | 11 ++- .../lib/deep-research/deep-research-redis.ts | 1 + .../deep-research/deep-research-service.ts | 89 ++++++++++++------- .../src/lib/deep-research/research-manager.ts | 59 +++++++----- apps/api/src/services/queue-worker.ts | 3 + apps/js-sdk/firecrawl/src/index.ts | 45 ++++++++-- apps/python-sdk/firecrawl/firecrawl.py | 9 +- 8 files changed, 156 insertions(+), 62 deletions(-) diff --git a/apps/api/src/controllers/v1/deep-research-status.ts b/apps/api/src/controllers/v1/deep-research-status.ts index bae49d0a..b261e7be 100644 --- a/apps/api/src/controllers/v1/deep-research-status.ts +++ b/apps/api/src/controllers/v1/deep-research-status.ts @@ -37,6 +37,7 @@ export async function deepResearchStatusController( finalAnalysis: research.finalAnalysis, sources: research.sources, activities: research.activities, + json: research.json, // completedSteps: research.completedSteps, // totalSteps: research.totalExpectedSteps, }, diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts index 7e454c3d..3d52d19d 100644 --- a/apps/api/src/controllers/v1/deep-research.ts +++ b/apps/api/src/controllers/v1/deep-research.ts @@ -1,5 +1,5 @@ import { Request, Response } from "express"; -import { RequestWithAuth } from "./types"; +import { extractOptions, RequestWithAuth } from "./types"; import { getDeepResearchQueue } from "../../services/queue-service"; import * as Sentry from "@sentry/node"; import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis"; @@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({ maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'), timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'), analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(), + systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(), + formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']), // @deprecated Use query instead topic: z.string().describe('The topic or question to research').optional(), + jsonOptions: extractOptions.optional(), }).refine(data => data.query || data.topic, { message: "Either query or topic must be provided" +}).refine((obj) => { + const hasJsonFormat = obj.formats?.includes("json"); + const hasJsonOptions = obj.jsonOptions !== undefined; + return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions); +}, { + message: "When 'json' format is specified, jsonOptions must be provided, and vice versa" }).transform(data => ({ ...data, query: data.topic || data.query // Use topic as query if provided diff --git a/apps/api/src/lib/deep-research/deep-research-redis.ts b/apps/api/src/lib/deep-research/deep-research-redis.ts index 8b1d9c7a..acefaacc 100644 --- a/apps/api/src/lib/deep-research/deep-research-redis.ts +++ b/apps/api/src/lib/deep-research/deep-research-redis.ts @@ -45,6 +45,7 @@ export type StoredDeepResearch = { activities: DeepResearchActivity[]; summaries: string[]; finalAnalysis?: string; + json?: any; }; // TTL of 6 hours diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 8a404d10..590014a8 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search"; import { ResearchLLMService, ResearchStateManager } from "./research-manager"; import { logJob } from "../../services/logging/log_job"; import { billTeam } from "../../services/billing/credit_billing"; +import { ExtractOptions } from "../../controllers/v1/types"; interface DeepResearchServiceOptions { researchId: string; @@ -15,6 +16,9 @@ interface DeepResearchServiceOptions { maxUrls: number; timeLimit: number; analysisPrompt: string; + systemPrompt: string; + formats: string[]; + jsonOptions: ExtractOptions; subId?: string; } @@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { await state.incrementDepth(); // Search phase - await state.addActivity({ + await state.addActivity([{ type: "search", status: "processing", message: `Generating deeper search queries for "${currentTopic}"`, timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); const nextSearchTopic = state.getNextSearchTopic(); logger.debug("[Deep Research] Next search topic:", { nextSearchTopic }); @@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { logger.debug("[Deep Research] Generated search queries:", { searchQueries }); - await state.addActivity({ + await state.addActivity([{ type: "search", status: "processing", message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`, timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); + await state.addActivity(searchQueries.map(searchQuery => ({ + type: "search", + status: "processing", + message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`, + timestamp: new Date().toISOString(), + depth: state.getCurrentDepth(), + }))) // Run all searches in parallel const searchPromises = searchQueries.map(async (searchQuery) => { - await state.addActivity({ - type: "search", - status: "processing", - message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`, - timestamp: new Date().toISOString(), - depth: state.getCurrentDepth(), - }); const response = await searchAndScrapeSearchResult(searchQuery.query, { teamId: options.teamId, @@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { "[Deep Research] No results found for topic:", { currentTopic }, ); - await state.addActivity({ + await state.addActivity([{ type: "search", status: "error", message: `No results found for any queries about "${currentTopic}"`, timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); continue; } @@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { "[Deep Research] No new unique results found for topic:", { currentTopic }, ); - await state.addActivity({ + await state.addActivity([{ type: "search", status: "error", message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`, timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); continue; } - await state.addActivity({ + await state.addActivity([{ type: "search", status: "complete", message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`, timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); await state.addFindings( newSearchResults.map((result) => ({ @@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { ); // Analysis phase - await state.addActivity({ + await state.addActivity([{ type: "analyze", status: "processing", message: "Analyzing findings and planning next steps", timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); const timeRemaining = timeLimit * 1000 - (Date.now() - startTime); logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining }); @@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { state.getFindings(), currentTopic, timeRemaining, + options.systemPrompt ?? "", ); if (!analysis) { logger.debug("[Deep Research] Analysis failed"); - await state.addActivity({ + await state.addActivity([{ type: "analyze", status: "error", message: "Failed to analyze findings", timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); state.incrementFailedAttempts(); if (state.hasReachedMaxFailedAttempts()) { @@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { state.setNextSearchTopic(analysis.nextSearchTopic || ""); - await state.addActivity({ + await state.addActivity([{ type: "analyze", status: "complete", message: "Analyzed findings", timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); if (!analysis.shouldContinue || analysis.gaps.length === 0) { logger.debug("[Deep Research] No more gaps to research, ending search"); @@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { // Final synthesis logger.debug("[Deep Research] Starting final synthesis"); - await state.addActivity({ + await state.addActivity([{ type: "synthesis", status: "processing", message: "Preparing final analysis", timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); - const finalAnalysis = await llmService.generateFinalAnalysis( - options.query, - state.getFindings(), - state.getSummaries(), - options.analysisPrompt, - ); + let finalAnalysis = ""; + let finalAnalysisJson = null; + if(options.formats.includes('json')) { + finalAnalysisJson = await llmService.generateFinalAnalysis( + options.query, + state.getFindings(), + state.getSummaries(), + options.analysisPrompt, + options.formats, + options.jsonOptions, + ); + } + if(options.formats.includes('markdown')) { + finalAnalysis = await llmService.generateFinalAnalysis( + options.query, + state.getFindings(), + state.getSummaries(), + options.analysisPrompt, + ); + } - await state.addActivity({ + await state.addActivity([{ type: "synthesis", status: "complete", message: "Research completed", timestamp: new Date().toISOString(), depth: state.getCurrentDepth(), - }); + }]); const progress = state.getProgress(); logger.debug("[Deep Research] Research completed successfully"); @@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { success: true, message: "Research completed", num_docs: 1, - docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }], + docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }], time_taken: (Date.now() - startTime) / 1000, team_id: teamId, mode: "deep-research", @@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { await updateDeepResearch(researchId, { status: "completed", finalAnalysis: finalAnalysis, + json: finalAnalysisJson, }); // Bill team for usage based on URLs analyzed billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch( @@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { data: { finalAnalysis: finalAnalysis, sources: state.getSources(), + json: finalAnalysisJson, }, }; } catch (error: any) { diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index d5f4fdd9..70e8067b 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -6,7 +6,7 @@ import { updateDeepResearch, } from "./deep-research-redis"; import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract"; - +import { ExtractOptions } from "../../controllers/v1/types"; interface AnalysisResult { gaps: string[]; nextSteps: string[]; @@ -50,13 +50,13 @@ export class ResearchStateManager { return this.seenUrls; } - async addActivity(activity: DeepResearchActivity): Promise { - if (activity.status === "complete") { + async addActivity(activities: DeepResearchActivity[]): Promise { + if (activities.some(activity => activity.status === "complete")) { this.completedSteps++; } await updateDeepResearch(this.researchId, { - activities: [activity], + activities: activities, completedSteps: this.completedSteps, }); } @@ -199,6 +199,7 @@ export class ResearchLLMService { findings: DeepResearchFinding[], currentTopic: string, timeRemaining: number, + systemPrompt: string, ): Promise { try { const timeRemainingMinutes = @@ -211,6 +212,7 @@ export class ResearchLLMService { options: { mode: "llm", systemPrompt: + systemPrompt + "You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " + new Date().toISOString().split("T")[0], schema: { @@ -254,33 +256,48 @@ export class ResearchLLMService { findings: DeepResearchFinding[], summaries: string[], analysisPrompt: string, - ): Promise { + formats?: string[], + jsonOptions?: ExtractOptions, + ): Promise { + if(!formats) { + formats = ['markdown']; + } + if(!jsonOptions) { + jsonOptions = undefined; + } + const { extract } = await generateCompletions({ logger: this.logger.child({ method: "generateFinalAnalysis", }), - mode: "no-object", + mode: formats.includes('json') ? 'object' : 'no-object', options: { mode: "llm", - systemPrompt: - "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + - new Date().toISOString().split("T")[0], + ...(formats.includes('json') && { + ...jsonOptions + }), + systemPrompt: formats.includes('json') + ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly." + : "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + + new Date().toISOString().split("T")[0], prompt: trimToTokenLimit( analysisPrompt ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}` - : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis. + : formats.includes('json') + ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}` + : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis. - Research data: - ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")} - - Requirements: - - Format the report in Markdown with proper headers and sections - - Include specific citations to sources where appropriate - - Provide detailed analysis in each section - - Make it comprehensive and thorough (aim for 4+ pages worth of content) - - Include all relevant findings and insights from the research - - Cite sources - - Use bullet points and lists where appropriate for readability`, + Research data: + ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")} + + Requirements: + - Format the report in Markdown with proper headers and sections + - Include specific citations to sources where appropriate + - Provide detailed analysis in each section + - Make it comprehensive and thorough (aim for 4+ pages worth of content) + - Include all relevant findings and insights from the research + - Cite sources + - Use bullet points and lists where appropriate for readability`, 100000, ).text, }, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 66931fce..b6cfc454 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async ( subId: job.data.subId, maxUrls: job.data.request.maxUrls, analysisPrompt: job.data.request.analysisPrompt, + systemPrompt: job.data.request.systemPrompt, + formats: job.data.request.formats, + jsonOptions: job.data.request.jsonOptions, }); if(result.success) { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index de4ead7f..5cc9d119 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -356,7 +356,7 @@ export interface CrawlErrorsResponse { * Parameters for deep research operations. * Defines options for conducting deep research on a query. */ -export interface DeepResearchParams { +export interface DeepResearchParams { /** * Maximum depth of research iterations (1-10) * @default 7 @@ -377,9 +377,25 @@ export interface DeepResearchParams { */ analysisPrompt?: string; /** + * The system prompt to use for the research agent + */ + systemPrompt?: string; + /** + * The formats to use for the final analysis + */ + formats?: ("markdown" | "json")[]; + /** + * The JSON options to use for the final analysis + */ + jsonOptions?:{ + prompt?: string; + schema?: LLMSchema; + systemPrompt?: string; + }; + /** * Experimental flag for streaming steps */ - __experimental_streamSteps?: boolean; + // __experimental_streamSteps?: boolean; } /** @@ -1420,7 +1436,7 @@ export default class FirecrawlApp { */ async deepResearch( query: string, - params: DeepResearchParams, + params: DeepResearchParams, onActivity?: (activity: { type: string; status: string; @@ -1505,12 +1521,31 @@ export default class FirecrawlApp { * @param params - Parameters for the deep research operation. * @returns The response containing the research job ID. */ - async asyncDeepResearch(query: string, params: DeepResearchParams): Promise { + async asyncDeepResearch(query: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); + let jsonData: any = { query, ...params }; + + if (jsonData?.jsonOptions?.schema) { + let schema = jsonData.jsonOptions.schema; + // Try parsing the schema as a Zod schema + try { + schema = zodToJsonSchema(schema); + } catch (error) { + + } + jsonData = { + ...jsonData, + jsonOptions: { + ...jsonData.jsonOptions, + schema: schema, + }, + }; + } + try { const response: AxiosResponse = await this.postRequest( `${this.apiUrl}/v1/deep-research`, - { query, ...params }, + jsonData, headers ); diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 436f8764..d8d22421 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel): timeLimit: Optional[int] = 270 maxUrls: Optional[int] = 20 analysisPrompt: Optional[str] = None + systemPrompt: Optional[str] = None __experimental_streamSteps: Optional[bool] = None class DeepResearchResponse(pydantic.BaseModel): @@ -1171,7 +1172,6 @@ class FirecrawlApp: time.sleep(2) # Polling interval return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]: """ Initiates an asynchronous deep research operation. @@ -1195,8 +1195,15 @@ class FirecrawlApp: research_params = params headers = self._prepare_headers() + json_data = {'query': query, **research_params.dict(exclude_none=True)} + # Handle json options schema if present + if 'jsonOptions' in json_data: + json_opts = json_data['jsonOptions'] + if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'): + json_data['jsonOptions']['schema'] = json_opts['schema'].schema() + try: response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers) if response.status_code == 200: From 555dab50e707934eaced413bb79a47b7b0568f43 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 24 Mar 2025 20:17:22 +0400 Subject: [PATCH 010/160] Nick: bump --- apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0aca8907..0e6dbf7c 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.20.1", + "version": "1.21.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 726a34d0..0adcb41d 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.14.1" +__version__ = "1.15.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From e8f27bef08ee7c918a3933f7377e76eb37a3d055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 25 Mar 2025 16:24:01 +0100 Subject: [PATCH 011/160] feat(rate-limiter): manual_etier2c --- apps/api/src/services/rate-limiter.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index be53bf80..288af6c6 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -241,6 +241,7 @@ const testSuiteTokens = [ ]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; +const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e"]; function makePlanKey(plan?: string) { return plan ? plan.replace("-", "") : "default"; // "default" @@ -288,7 +289,7 @@ export function getRateLimiter( return etier2aRateLimiter; } - if (teamId && manual.includes(teamId)) { + if (teamId && (manual.includes(teamId) || manual_etier2c.includes(teamId))) { return manualRateLimiter; } @@ -326,6 +327,10 @@ export function getConcurrencyLimitMax( return CONCURRENCY_LIMIT.manual; } + if (teamId && manual_etier2c.includes(teamId)) { + return CONCURRENCY_LIMIT.etier2c; + } + return CONCURRENCY_LIMIT[plan] ?? 10; } From 42236ef0f09c72bebf0838b5b675df05d2db2054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 25 Mar 2025 20:51:40 +0100 Subject: [PATCH 012/160] feat(admin/check-fire-engine): better logging --- apps/api/src/controllers/v0/admin/check-fire-engine.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v0/admin/check-fire-engine.ts b/apps/api/src/controllers/v0/admin/check-fire-engine.ts index 7073415c..f8bf8beb 100644 --- a/apps/api/src/controllers/v0/admin/check-fire-engine.ts +++ b/apps/api/src/controllers/v0/admin/check-fire-engine.ts @@ -16,7 +16,7 @@ export async function checkFireEngine(req: Request, res: Response) { const timeout = setTimeout(() => controller.abort(), 30000); const urls = ["https://roastmywebsite.ai", "https://example.com"]; - let lastError: string | null = null; + let lastError: any = null; for (const url of urls) { try { @@ -57,7 +57,11 @@ export async function checkFireEngine(req: Request, res: Response) { } // If we get here, all retries failed - logger.error(lastError); + logger.error("An error occurred while checking fire-engine", { + module: "admin", + method: "checkFireEngine", + error: lastError, + }); Sentry.captureException(lastError); return res.status(500).json({ success: false, From 6e8644a14c8b9a5bfc96c0f50cf660f4e54fd84f Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Wed, 26 Mar 2025 17:19:47 +0530 Subject: [PATCH 013/160] Add examples/gemini-2.5-pro crawler --- examples/gemini-2.5-crawler/.env.example | 6 + examples/gemini-2.5-crawler/README.md | 89 +++++ .../gemini-2.5-crawler/gemini-2.5-crawler.py | 374 ++++++++++++++++++ examples/gemini-2.5-crawler/requirements.txt | 5 + 4 files changed, 474 insertions(+) create mode 100644 examples/gemini-2.5-crawler/.env.example create mode 100644 examples/gemini-2.5-crawler/README.md create mode 100644 examples/gemini-2.5-crawler/gemini-2.5-crawler.py create mode 100644 examples/gemini-2.5-crawler/requirements.txt diff --git a/examples/gemini-2.5-crawler/.env.example b/examples/gemini-2.5-crawler/.env.example new file mode 100644 index 00000000..e3dd7329 --- /dev/null +++ b/examples/gemini-2.5-crawler/.env.example @@ -0,0 +1,6 @@ +# Firecrawl API key from your Firecrawl account +FIRECRAWL_API_KEY=your_firecrawl_api_key_here + +# Google Cloud API key with Gemini API access +# Get this from Google Cloud Console: https://console.cloud.google.com/ +GEMINI_API_KEY=your_gemini_api_key_here \ No newline at end of file diff --git a/examples/gemini-2.5-crawler/README.md b/examples/gemini-2.5-crawler/README.md new file mode 100644 index 00000000..ae2367b2 --- /dev/null +++ b/examples/gemini-2.5-crawler/README.md @@ -0,0 +1,89 @@ +# Gemini 2.5 Web Crawler + +A powerful web crawler that uses Google's Gemini 2.5 Pro model to intelligently analyze web content, PDFs, and images based on user-defined objectives. + +## Features + +- Intelligent URL mapping and ranking based on relevance to search objective +- PDF content extraction and analysis +- Image content analysis and description +- Smart content filtering based on user objectives +- Support for multiple content types (markdown, PDFs, images) +- Color-coded console output for better readability + +## Prerequisites + +- Python 3.8+ +- Google Cloud API key with Gemini API access +- Firecrawl API key + +## Installation + +1. Clone the repository: + +```bash +git clone +cd +``` + +2. Install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +3. Create a `.env` file based on `.env.example`: + +```bash +cp .env.example .env +``` + +4. Add your API keys to the `.env` file: + +``` +FIRECRAWL_API_KEY=your_firecrawl_api_key +GEMINI_API_KEY=your_gemini_api_key +``` + +## Usage + +Run the script: + +```bash +python gemini-2.5-crawler.py +``` + +The script will prompt you for: + +1. The website URL to crawl +2. Your search objective + +The crawler will then: + +1. Map the website and find relevant pages +2. Analyze the content using Gemini 2.5 Pro +3. Extract and analyze any PDFs or images found +4. Return structured information related to your objective + +## Output + +The script provides color-coded console output for: + +- Process steps and progress +- Debug information +- Success and error messages +- Final results in JSON format + +## Error Handling + +The script includes comprehensive error handling for: + +- API failures +- Content extraction issues +- Invalid URLs +- Timeouts +- JSON parsing errors + +## Note + +This script uses the experimental Gemini 2.5 Pro model (`gemini-2.5-pro-exp-03-25`). Make sure you have appropriate access and quota for using this model. diff --git a/examples/gemini-2.5-crawler/gemini-2.5-crawler.py b/examples/gemini-2.5-crawler/gemini-2.5-crawler.py new file mode 100644 index 00000000..ade0cc20 --- /dev/null +++ b/examples/gemini-2.5-crawler/gemini-2.5-crawler.py @@ -0,0 +1,374 @@ +import os +from firecrawl import FirecrawlApp +import json +import re +import requests +from requests.exceptions import RequestException +from dotenv import load_dotenv +import google.genai as genai +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +gemini_api_key = os.getenv("GEMINI_API_KEY") + +# Initialize the FirecrawlApp and Gemini client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = genai.Client(api_key=gemini_api_key) # Create Gemini client +model_name = "gemini-2.5-pro-exp-03-25" +types = genai.types + +# ANSI color codes + + +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + + +def pdf_size_in_mb(data: bytes) -> float: + """Utility function to estimate PDF size in MB from raw bytes.""" + return len(data) / (1024 * 1024) + + +def gemini_extract_pdf_content(pdf_url, objective): + """ + Downloads a PDF from pdf_url, then calls Gemini to extract text. + Returns a string with the extracted text only. + """ + try: + pdf_data = requests.get(pdf_url, timeout=15).content + size_mb = pdf_size_in_mb(pdf_data) + if size_mb > 15: + print( + f"{Colors.YELLOW}Warning: PDF size is {size_mb} MB. Skipping PDF extraction.{Colors.RESET}") + return "" + + prompt = f""" + The objective is: {objective}. + From this PDF, extract only the text that helps address this objective. + If it contains no relevant info, return an empty string. + """ + response = client.models.generate_content( + model=model_name, + contents=[ + types.Part.from_bytes( + data=pdf_data, mime_type="application/pdf"), + prompt + ] + ) + return response.text.strip() + except Exception as e: + print(f"Error using Gemini to process PDF '{pdf_url}': {str(e)}") + return "" + + +def gemini_extract_image_data(image_url): + """ + Downloads an image from image_url, then calls Gemini to: + 1) Summarize what's in the image + Returns a string with the summary. + """ + try: + print(f"Gemini IMAGE extraction from: {image_url}") + image_data = requests.get(image_url, timeout=15).content + # 1) Summarize + resp_summary = client.models.generate_content([ + "Describe the contents of this image in a short paragraph.", + types.Part.from_bytes(data=image_data, mime_type="image/jpeg"), + ]) + summary_text = resp_summary.text.strip() + + return f"**Image Summary**:\n{summary_text}" + except Exception as e: + print(f"Error using Gemini to process Image '{image_url}': {str(e)}") + return "" + + +def extract_urls_from_markdown(markdown_text): + """ + Simple regex-based approach to extract potential URLs from a markdown string. + We look for http(s)://someurl up until a space or parenthesis or quote, etc. + """ + pattern = r'(https?://[^\s\'")]+)' + found = re.findall(pattern, markdown_text) + return list(set(found)) # unique them + + +def detect_mime_type(url, timeout=8): + """ + Attempt a HEAD request to detect the Content-Type. Return 'pdf', 'image' or None if undetermined. + Also validates image extensions for supported formats. + """ + try: + resp = requests.head(url, timeout=timeout, allow_redirects=True) + ctype = resp.headers.get('Content-Type', '').lower() + exts = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.heic', '.heif'] + + if 'pdf' in ctype: + return 'pdf' + elif ctype.startswith('image/') and any(url.lower().endswith(ext) for ext in exts): + return 'image' + else: + return None + except RequestException as e: + print(f"Warning: HEAD request failed for {url}. Error: {e}") + return None + + +def find_relevant_page_via_map(objective, url, app): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + Based on the objective of: {objective}, provide a 1-2 word search parameter that will help find the information. + Respond with ONLY 1-2 words, no other text or formatting. + """ + + print( + f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + # Use gemini-pro instead of gemini-2.0-flash + response = client.models.generate_content( + model=model_name, + contents=[map_prompt] + ) + + map_search_parameter = response.text.strip() + print( + f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print( + f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + + print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}") + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + if isinstance(map_website, dict): + links = map_website.get('urls', []) or map_website.get('links', []) + elif isinstance(map_website, str): + try: + parsed = json.loads(map_website) + links = parsed.get('urls', []) or parsed.get('links', []) + except json.JSONDecodeError: + links = [] + else: + links = map_website if isinstance(map_website, list) else [] + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + rank_prompt = f"""RESPOND ONLY WITH JSON. + Analyze these URLs and rank the top 3 most relevant ones for finding information about: {objective} + + Return ONLY a JSON array in this exact format - no other text or explanation: + [ + {{ + "url": "http://example.com", + "relevance_score": 95, + "reason": "Main about page with company information" + }}, + {{ + "url": "http://example2.com", + "relevance_score": 85, + "reason": "Team page with details" + }}, + {{ + "url": "http://example3.com", + "relevance_score": 75, + "reason": "Blog post about company" + }} + ] + + URLs to analyze: + {json.dumps(links, indent=2)}""" + + print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}") + response = client.models.generate_content( + model=model_name, + contents=[rank_prompt] + ) + + print(f"{Colors.MAGENTA}Debug - Raw Gemini response:{Colors.RESET}") + print(response.text) + + try: + response_text = response.text.strip() + print(f"{Colors.MAGENTA}Debug - Cleaned response:{Colors.RESET}") + print(response_text) + + if '[' in response_text and ']' in response_text: + start_idx = response_text.find('[') + end_idx = response_text.rfind(']') + 1 + json_str = response_text[start_idx:end_idx] + + print( + f"{Colors.MAGENTA}Debug - Extracted JSON string:{Colors.RESET}") + print(json_str) + + ranked_results = json.loads(json_str) + else: + print(f"{Colors.RED}No JSON array found in response{Colors.RESET}") + return None + + links = [result["url"] for result in ranked_results] + + print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}") + for result in ranked_results: + print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}") + print( + f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}") + print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}") + print("---") + + if not links: + print(f"{Colors.RED}No relevant links identified.{Colors.RESET}") + return None + + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}") + print(f"{Colors.RED}Failed JSON string: {response_text}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}") + return links + + except Exception as e: + print( + f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + + +def find_objective_in_top_pages(map_website, objective, app): + try: + if not map_website: + print(f"{Colors.RED}No links found to analyze.{Colors.RESET}") + return None + + top_links = map_website[:3] + print( + f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + for link in top_links: + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") + scrape_result = app.scrape_url( + link, params={'formats': ['markdown']}) + print( + f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + # Now detect any PDF or image URLs in the Markdown text + page_markdown = scrape_result.get('markdown', '') + if not page_markdown: + print( + f"{Colors.RED}No markdown returned for {link}, skipping...{Colors.RESET}") + continue + + found_urls = extract_urls_from_markdown(page_markdown) + pdf_image_append = "" + + for sub_url in found_urls: + mime_type_short = detect_mime_type(sub_url) + if mime_type_short == 'pdf': + print( + f"{Colors.YELLOW} Detected PDF: {sub_url}. Extracting content...{Colors.RESET}") + pdf_content = gemini_extract_pdf_content(sub_url) + if pdf_content: + pdf_image_append += f"\n\n---\n[PDF from {sub_url}]:\n{pdf_content}" + elif mime_type_short == 'image': + print( + f"{Colors.YELLOW} Detected Image: {sub_url}. Extracting content...{Colors.RESET}") + image_content = gemini_extract_image_data(sub_url) + if image_content: + pdf_image_append += f"\n\n---\n[Image from {sub_url}]:\n{image_content}" + + # Append extracted PDF/image text to the main markdown for the page + if pdf_image_append: + scrape_result[ + 'markdown'] += f"\n\n---\n**Additional Gemini Extraction:**\n{pdf_image_append}\n" + + check_prompt = f""" + Analyze this content to find: {objective} + If found, return ONLY a JSON object with information related to the objective. If not found, respond EXACTLY with: Objective not met + + Content to analyze: + {scrape_result['markdown']} + + Remember: + - Return valid JSON if information is found + - Return EXACTLY "Objective not met" if not found + - No other text or explanations + """ + + response = client.models.generate_content( + model=model_name, + contents=[check_prompt] + ) + + result = response.text.strip() + + print(f"{Colors.MAGENTA}Debug - Check response:{Colors.RESET}") + print(result) + + if result != "Objective not met": + print( + f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}") + try: + if '{' in result and '}' in result: + start_idx = result.find('{') + end_idx = result.rfind('}') + 1 + json_str = result[start_idx:end_idx] + return json.loads(json_str) + else: + print( + f"{Colors.RED}No JSON object found in response{Colors.RESET}") + except json.JSONDecodeError: + print( + f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}") + else: + print( + f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + + print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print( + f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + + +def main(): + url = input(f"{Colors.BLUE}Enter the website to crawl : {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + map_website = find_relevant_page_via_map(objective, url, app) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using gemini-pro...{Colors.RESET}") + result = find_objective_in_top_pages(map_website, objective, app) + + if result: + print( + f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print( + f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + + +if __name__ == "__main__": + main() diff --git a/examples/gemini-2.5-crawler/requirements.txt b/examples/gemini-2.5-crawler/requirements.txt new file mode 100644 index 00000000..b33a710c --- /dev/null +++ b/examples/gemini-2.5-crawler/requirements.txt @@ -0,0 +1,5 @@ +google-cloud-aiplatform>=1.36.0 +google-generativeai>=0.3.2 +python-dotenv>=1.0.0 +requests>=2.31.0 +firecrawl>=0.1.0 \ No newline at end of file From cc7f38af5cc5b735fd7b4aa13429da2dd2b18d2d Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Wed, 26 Mar 2025 17:25:46 +0530 Subject: [PATCH 014/160] Minor changes --- examples/gemini-2.5-crawler/gemini-2.5-crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gemini-2.5-crawler/gemini-2.5-crawler.py b/examples/gemini-2.5-crawler/gemini-2.5-crawler.py index ade0cc20..52ceb9f4 100644 --- a/examples/gemini-2.5-crawler/gemini-2.5-crawler.py +++ b/examples/gemini-2.5-crawler/gemini-2.5-crawler.py @@ -134,7 +134,7 @@ def find_relevant_page_via_map(objective, url, app): print( f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") - # Use gemini-pro instead of gemini-2.0-flash + response = client.models.generate_content( model=model_name, contents=[map_prompt] From be43598071918cdfb69e983dac367660470f7e83 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Wed, 26 Mar 2025 17:51:29 +0530 Subject: [PATCH 015/160] feature/gemini-2.5-company-extractor --- .../gemini-2.5-web-extractor/.env.example | 8 + examples/gemini-2.5-web-extractor/.gitignore | 34 +++ examples/gemini-2.5-web-extractor/README.md | 85 +++++++ .../gemini-2.5-web-extractor.py | 213 ++++++++++++++++++ .../gemini-2.5-web-extractor/requirements.txt | 4 + 5 files changed, 344 insertions(+) create mode 100644 examples/gemini-2.5-web-extractor/.env.example create mode 100644 examples/gemini-2.5-web-extractor/.gitignore create mode 100644 examples/gemini-2.5-web-extractor/README.md create mode 100644 examples/gemini-2.5-web-extractor/gemini-2.5-web-extractor.py create mode 100644 examples/gemini-2.5-web-extractor/requirements.txt diff --git a/examples/gemini-2.5-web-extractor/.env.example b/examples/gemini-2.5-web-extractor/.env.example new file mode 100644 index 00000000..9b555916 --- /dev/null +++ b/examples/gemini-2.5-web-extractor/.env.example @@ -0,0 +1,8 @@ +# Google Gemini API Key +GOOGLE_API_KEY=your_google_api_key_here + +# Firecrawl API Key +FIRECRAWL_API_KEY=your_firecrawl_api_key_here + +# SerpAPI Key +SERP_API_KEY=your_serp_api_key_here \ No newline at end of file diff --git a/examples/gemini-2.5-web-extractor/.gitignore b/examples/gemini-2.5-web-extractor/.gitignore new file mode 100644 index 00000000..f5d50e2d --- /dev/null +++ b/examples/gemini-2.5-web-extractor/.gitignore @@ -0,0 +1,34 @@ +# Environment variables +.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/examples/gemini-2.5-web-extractor/README.md b/examples/gemini-2.5-web-extractor/README.md new file mode 100644 index 00000000..a5554ea4 --- /dev/null +++ b/examples/gemini-2.5-web-extractor/README.md @@ -0,0 +1,85 @@ +# Gemini 2.5 Web Extractor + +A powerful web information extraction tool that combines Google's Gemini 2.5 Pro (Experimental) model with Firecrawl's web extraction capabilities to gather structured information about companies from the web. + +## Features + +- Uses Google Search (via SerpAPI) to find relevant web pages +- Leverages Gemini 2.5 Pro (Experimental) to intelligently select the most relevant URLs +- Extracts structured information using Firecrawl's advanced web extraction +- Real-time progress monitoring and colorized console output + +## Prerequisites + +- Python 3.8 or higher +- Google API Key (Gemini) +- Firecrawl API Key +- SerpAPI Key + +## Setup + +1. Clone the repository: + +```bash +git clone +cd gemini-2.5-web-extractor +``` + +2. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +3. Set up environment variables: + - Copy `.env.example` to `.env` + - Fill in your API keys in the `.env` file: + - `GOOGLE_API_KEY`: Your Google API key for Gemini + - `FIRECRAWL_API_KEY`: Your Firecrawl API key + - `SERP_API_KEY`: Your SerpAPI key + +## Usage + +Run the script: + +```bash +python gemini-2.5-web-extractor.py +``` + +The script will: + +1. Prompt you for a company name +2. Ask what information you want to extract about the company +3. Search for relevant web pages +4. Use Gemini to select the most relevant URLs +5. Extract structured information using Firecrawl +6. Display the results in a formatted JSON output + +## Example + +```bash +Enter the company name: Tesla +Enter what information you want about the company: latest electric vehicle models and their specifications +``` + +The script will then: + +1. Search for relevant Tesla information +2. Select the most informative URLs about Tesla's current EV lineup +3. Extract and structure the vehicle specifications +4. Present the data in a clean, organized format + +## Error Handling + +The script includes comprehensive error handling for: + +- API failures +- Network issues +- Invalid responses +- Timeout scenarios + +All errors are clearly displayed with colored output for better visibility. + +## License + +[Add your license information here] diff --git a/examples/gemini-2.5-web-extractor/gemini-2.5-web-extractor.py b/examples/gemini-2.5-web-extractor/gemini-2.5-web-extractor.py new file mode 100644 index 00000000..53f9e791 --- /dev/null +++ b/examples/gemini-2.5-web-extractor/gemini-2.5-web-extractor.py @@ -0,0 +1,213 @@ +import os +import json +import time +import requests +from dotenv import load_dotenv +from serpapi.google_search import GoogleSearch +from google import genai + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +serp_api_key = os.getenv("SERP_API_KEY") + + +if not firecrawl_api_key: + print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}") + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": serp_api_key}) + return search.get_dict().get("organic_results", []) + +def select_urls_with_gemini(company, objective, serp_results): + """ + Use Gemini 2.5 Flash to select URLs from SERP results. + Returns a list of URLs. + """ + try: + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + prompt = ( + "Task: Select relevant URLs from search results.\n\n" + "Instructions:\n" + "1. Analyze the search results for information about the specified company\n" + "2. Select URLs that are most likely to contain the requested information\n" + "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n" + "4. Do not include social media links\n\n" + f"Company: {company}\n" + f"Information Needed: {objective}\n" + f"Search Results: {json.dumps(serp_data, indent=2)}\n\n" + "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}" + ) + + response = client.models.generate_content( + model="gemini-2.5-pro-exp-03-25", + contents=prompt + ) + + # Clean the response text + cleaned_response = response.text.strip() + if cleaned_response.startswith('```'): + cleaned_response = cleaned_response.split('```')[1] + if cleaned_response.startswith('json'): + cleaned_response = cleaned_response[4:] + cleaned_response = cleaned_response.strip() + + try: + # Parse JSON response + result = json.loads(cleaned_response) + if isinstance(result, dict) and "selected_urls" in result: + urls = result["selected_urls"] + else: + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + except json.JSONDecodeError: + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + + # Clean up URLs + cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls] + cleaned_urls = [url for url in cleaned_urls if url] + + if not cleaned_urls: + print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}") + return [] + + print(f"{Colors.CYAN}Selected URLs for extraction:{Colors.RESET}") + for url in cleaned_urls: + print(f"- {url}") + + return cleaned_urls + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}") + return [] + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": prompt + " for " + company, + "enableWebSearch": True + } + + try: + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload, + timeout=30 + ) + + data = response.json() + + if not data.get('success'): + print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}") + return None + + extraction_id = data.get('id') + if not extraction_id: + print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}") + return None + + return poll_firecrawl_result(extraction_id, api_key) + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request failed: {e}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def poll_firecrawl_result(extraction_id, api_key, interval=10, max_attempts=60): + """Poll Firecrawl API to get the extraction result.""" + url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}" + headers = { + 'Authorization': f'Bearer {api_key}' + } + + print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}") + + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + if data.get('success') and data.get('data'): + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + print(json.dumps(data['data'], indent=2)) + return data['data'] + elif data.get('success') and not data.get('data'): + if attempt % 6 == 0: + print(f"{Colors.YELLOW}Still processing... (attempt {attempt}/{max_attempts}){Colors.RESET}") + time.sleep(interval) + else: + print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}") + return None + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}") + return None + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + serp_results = search_google(f"{company}") + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + selected_urls = select_urls_with_gemini(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}No URLs were selected.{Colors.RESET}") + return + + data = extract_company_info(selected_urls, objective, company, firecrawl_api_key) + + if data: + print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}") + else: + print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main() diff --git a/examples/gemini-2.5-web-extractor/requirements.txt b/examples/gemini-2.5-web-extractor/requirements.txt new file mode 100644 index 00000000..c072b3cd --- /dev/null +++ b/examples/gemini-2.5-web-extractor/requirements.txt @@ -0,0 +1,4 @@ +python-dotenv==1.0.0 +google-generativeai==0.3.2 +requests==2.31.0 +serpapi==0.1.5 \ No newline at end of file From 2b39788d73c103060d75eac294d32cb65f70d53c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 26 Mar 2025 20:00:25 +0100 Subject: [PATCH 016/160] manual rl --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 288af6c6..29173f76 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -240,7 +240,7 @@ const testSuiteTokens = [ "0a18c9e", // gh ]; -const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; +const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e"]; function makePlanKey(plan?: string) { From e799cf206b5d357158d242f4dcec76943f47194d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 26 Mar 2025 20:13:32 +0100 Subject: [PATCH 017/160] increase manual rl --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 29173f76..a8f885a0 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -192,7 +192,7 @@ export const devBRateLimiter = new RateLimiterRedis({ export const manualRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "manual", - points: 2000, + points: 10000, duration: 60, // Duration in seconds }); From da76524771ba702d80b9c5fc869256d929740cf1 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Fri, 28 Mar 2025 16:05:16 +0530 Subject: [PATCH 018/160] Add examples/deepseek-v3-crawler --- examples/deepseek-v3-crawler/.gitignore | 50 ++++++ examples/deepseek-v3-crawler/README.md | 68 ++++++++ .../deepseek-v3-crawler.py | 164 ++++++++++++++++++ examples/deepseek-v3-crawler/requirements.txt | 3 + 4 files changed, 285 insertions(+) create mode 100644 examples/deepseek-v3-crawler/.gitignore create mode 100644 examples/deepseek-v3-crawler/README.md create mode 100644 examples/deepseek-v3-crawler/deepseek-v3-crawler.py create mode 100644 examples/deepseek-v3-crawler/requirements.txt diff --git a/examples/deepseek-v3-crawler/.gitignore b/examples/deepseek-v3-crawler/.gitignore new file mode 100644 index 00000000..7a36ce52 --- /dev/null +++ b/examples/deepseek-v3-crawler/.gitignore @@ -0,0 +1,50 @@ +# Environment variables +.env +.env.* + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# Editor files +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS specific files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ \ No newline at end of file diff --git a/examples/deepseek-v3-crawler/README.md b/examples/deepseek-v3-crawler/README.md new file mode 100644 index 00000000..8f5ffd43 --- /dev/null +++ b/examples/deepseek-v3-crawler/README.md @@ -0,0 +1,68 @@ +# DeepSeek V3 Web Crawler + +This script uses the DeepSeek V3 large language model (via Hugging Face's Inference API) and FireCrawl to crawl websites based on specific objectives. + +## Prerequisites + +- Python 3.8+ +- A FireCrawl API key (get one at [FireCrawl's website](https://firecrawl.app)) +- A Hugging Face API key with access to inference API + +## Installation + +1. Clone this repository: + +```bash +git clone +cd +``` + +2. Install the required packages: + +```bash +pip install -r requirements.txt +``` + +3. Create a `.env` file in the root directory with your API keys: + +``` +FIRECRAWL_API_KEY=your_firecrawl_api_key +HUGGINGFACE_API_KEY=your_huggingface_api_key +``` + +## Usage + +Run the script: + +```bash +python deepseek-v3-crawler.py +``` + +The script will prompt you to: + +1. Enter a website URL to crawl +2. Enter your objective (what information you're looking for) + +The script will then: + +- Use DeepSeek V3 to generate optimal search parameters for the website +- Map the website to find relevant pages +- Crawl the most relevant pages to extract information based on your objective +- Output the results in JSON format if successful + +## Example + +Input: + +- Website: https://www.example.com +- Objective: Find information about their pricing plans + +Output: + +- The script will output structured JSON data containing the pricing information found on the website. + +## Notes + +- The script uses DeepSeek V3, an advanced language model, to analyze web content. +- The model is accessed via Hugging Face's Inference API. +- You may need to adjust temperature or max_new_tokens parameters in the script based on your needs. diff --git a/examples/deepseek-v3-crawler/deepseek-v3-crawler.py b/examples/deepseek-v3-crawler/deepseek-v3-crawler.py new file mode 100644 index 00000000..3e4075b1 --- /dev/null +++ b/examples/deepseek-v3-crawler/deepseek-v3-crawler.py @@ -0,0 +1,164 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from openai import OpenAI + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +openrouter_api_key = os.getenv("OPENROUTER_API_KEY") + +# Initialize the FirecrawlApp and OpenRouter client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=openrouter_api_key +) + +def main(): + try: + # Test the model availability first + test_response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{"role": "user", "content": "test"}] + ) + except Exception as e: + print(f"{Colors.RED}Error: Could not connect to the language model. Please try again later.{Colors.RESET}") + print(f"{Colors.RED}Details: {str(e)}{Colors.RESET}") + return + + url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + + relevant_pages = find_relevant_page_via_map(objective, url, app, client) + + if not relevant_pages: + print(f"{Colors.RED}No relevant pages found. Exiting...{Colors.RESET}") + return + + result = find_objective_in_top_pages(relevant_pages, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully found! Extracted information:{Colors.RESET}") + print(json.dumps(result, indent=2)) + else: + print(f"{Colors.RED}Objective could not be fulfilled.{Colors.RESET}") + +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. Objective: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Searching website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + + response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{"role": "user", "content": map_prompt}] + ) + map_search_parameter = response.choices[0].message.content.strip() + + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + map_website = app.map_url(url, params={"search": map_search_parameter}) + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + links = map_website.get('urls', []) or map_website.get('links', []) + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + return links + + except Exception as e: + print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}") + return None + +def find_objective_in_top_pages(pages, objective, app, client): + try: + for link in pages[:3]: + print(f"{Colors.YELLOW}Scraping page: {link}{Colors.RESET}") + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple JSON format. + If the objective is not met, respond with exactly 'Objective not met'. + + The JSON format should be: + {{ + "found": true, + "data": {{ + // extracted information here + }} + }} + + Important: Do not wrap the JSON in markdown code blocks. Just return the raw JSON. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + """ + + # Using OpenRouter's API to analyze the content + response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{ + "role": "system", + "content": "You are a helpful assistant that extracts information from web pages. Always respond in valid JSON format when information is found. Do not wrap the JSON in markdown code blocks." + }, { + "role": "user", + "content": check_prompt + }] + ) + result = response.choices[0].message.content.strip() + + print(f"{Colors.CYAN}Model response: {result}{Colors.RESET}") # Debug output + + if result == "Objective not met": + print(f"{Colors.YELLOW}Objective not met in this page, continuing search...{Colors.RESET}") + continue + + try: + # Clean up the response if it's wrapped in code blocks + if result.startswith('```'): + result = result.split('```')[1] + if result.startswith('json'): + result = result[4:] + result = result.strip() + + parsed_result = json.loads(result) + if isinstance(parsed_result, dict) and parsed_result.get('found'): + return parsed_result.get('data') + else: + print(f"{Colors.YELLOW}Invalid response format, continuing search...{Colors.RESET}") + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}") + print(f"{Colors.RED}Raw response: {result}{Colors.RESET}") + continue + + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}") + return None + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/deepseek-v3-crawler/requirements.txt b/examples/deepseek-v3-crawler/requirements.txt new file mode 100644 index 00000000..e179518c --- /dev/null +++ b/examples/deepseek-v3-crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl==1.13.5 +python-dotenv==1.0.1 +huggingface-hub>=0.20.0 \ No newline at end of file From 28928f0006902c48899d05261d9e3fda20e7e980 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Fri, 28 Mar 2025 16:10:22 +0530 Subject: [PATCH 019/160] Add examples/DeepSeekv3 company researcher --- .../deepseek-v3-company-researcher/.gitignore | 50 ++++ .../deepseek-v3-company-researcher/README.md | 81 ++++++ .../deepseek-v3-extract.py | 248 ++++++++++++++++++ .../requirements.txt | 5 + 4 files changed, 384 insertions(+) create mode 100644 examples/deepseek-v3-company-researcher/.gitignore create mode 100644 examples/deepseek-v3-company-researcher/README.md create mode 100644 examples/deepseek-v3-company-researcher/deepseek-v3-extract.py create mode 100644 examples/deepseek-v3-company-researcher/requirements.txt diff --git a/examples/deepseek-v3-company-researcher/.gitignore b/examples/deepseek-v3-company-researcher/.gitignore new file mode 100644 index 00000000..7a36ce52 --- /dev/null +++ b/examples/deepseek-v3-company-researcher/.gitignore @@ -0,0 +1,50 @@ +# Environment variables +.env +.env.* + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# Editor files +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS specific files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ \ No newline at end of file diff --git a/examples/deepseek-v3-company-researcher/README.md b/examples/deepseek-v3-company-researcher/README.md new file mode 100644 index 00000000..a72eafa2 --- /dev/null +++ b/examples/deepseek-v3-company-researcher/README.md @@ -0,0 +1,81 @@ +# DeepSeek V3 Company Researcher + +This tool is a powerful company research assistant that combines Google search, DeepSeek Chat V3, and Firecrawl to gather and analyze company information automatically. + +## Features + +- Automated Google search using SerpAPI +- Intelligent URL selection using DeepSeek Chat V3 +- Structured data extraction using Firecrawl +- Real-time progress monitoring and colorized output +- Automated handling of rate limits and polling + +## Prerequisites + +- Python 3.7+ +- API keys for: + - OpenRouter (for DeepSeek Chat V3 access) + - Firecrawl + - SerpAPI + +## Setup + +1. Clone the repository +2. Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` +3. Create a `.env` file in the project root with your API keys: + ``` + OPENROUTER_API_KEY=your_openrouter_api_key + FIRECRAWL_API_KEY=your_firecrawl_api_key + SERP_API_KEY=your_serpapi_key + ``` + +## Usage + +Run the script: + +```bash +python deepseek-v3-extract.py +``` + +Follow the interactive prompts to: + +1. Enter the company name you want to research +2. Specify what information you want to gather about the company + +The tool will: + +- Search for relevant company information +- Select the most promising URLs +- Extract structured data from those URLs +- Present the findings in a clear, formatted output + +## Output + +The script provides real-time feedback with color-coded status messages: + +- 🔵 Blue: User prompts +- 🟡 Yellow: Processing status +- 🟢 Green: Success messages +- 🔴 Red: Error messages +- 🟣 Magenta: Special notifications +- 🔅 Cyan: URL selections + +## Error Handling + +The script includes comprehensive error handling for: + +- API failures +- Network issues +- Invalid responses +- Timeout scenarios + +## License + +MIT License + +## Contributing + +Feel free to open issues or submit pull requests with improvements. diff --git a/examples/deepseek-v3-company-researcher/deepseek-v3-extract.py b/examples/deepseek-v3-company-researcher/deepseek-v3-extract.py new file mode 100644 index 00000000..80af0bdd --- /dev/null +++ b/examples/deepseek-v3-company-researcher/deepseek-v3-extract.py @@ -0,0 +1,248 @@ +import os +import json +import time +import requests +from dotenv import load_dotenv +from serpapi.google_search import GoogleSearch +from openai import OpenAI + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +openrouter_api_key = os.getenv("OPENROUTER_API_KEY") +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +serp_api_key = os.getenv("SERP_API_KEY") + +if not openrouter_api_key: + print(f"{Colors.RED}Warning: OPENROUTER_API_KEY not found in environment variables{Colors.RESET}") +if not firecrawl_api_key: + print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}") +if not serp_api_key: + print(f"{Colors.RED}Warning: SERP_API_KEY not found in environment variables{Colors.RESET}") + +# Initialize OpenRouter client +client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=openrouter_api_key +) + +def clean_url(url): + """Clean a URL by removing tracking parameters and unnecessary query strings.""" + if not isinstance(url, str): + return None + + # Remove any query parameters + base_url = url.split('?')[0] + + # Remove trailing slashes and cleanup + cleaned = base_url.rstrip('/') + cleaned = cleaned.replace('/*', '') + + # Ensure it's a valid http(s) URL + if not cleaned.startswith(('http://', 'https://')): + return None + + return cleaned + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": serp_api_key}) + return search.get_dict().get("organic_results", []) + +def select_urls_with_claude(company, objective, serp_results): + """ + Use Claude 3.7 Sonnet to select URLs from SERP results. + Returns a list of URLs. + """ + try: + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + prompt = ( + "Task: Select relevant URLs from search results.\n\n" + "Instructions:\n" + "1. Analyze the search results for information about the specified company\n" + "2. Select URLs that are most likely to contain the requested information\n" + "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n" + "4. Do not include social media links\n\n" + f"Company: {company}\n" + f"Information Needed: {objective}\n" + f"Search Results: {json.dumps(serp_data, indent=2)}\n\n" + "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}" + ) + + # Using OpenRouter's API to select URLs + response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{ + "role": "system", + "content": "You are a URL selection assistant. Your task is to select relevant URLs from search results. You MUST return a valid JSON object containing at least one URL." + }, { + "role": "user", + "content": prompt + }] + ) + + result = response.choices[0].message.content.strip() + + # Clean the response text + if result.startswith('```'): + result = result.split('```')[1] + if result.startswith('json'): + result = result[4:] + result = result.strip() + + try: + # Parse JSON response + parsed_result = json.loads(result) + if isinstance(parsed_result, dict) and "selected_urls" in parsed_result: + urls = parsed_result["selected_urls"] + else: + # Fallback to text parsing + urls = [line.strip() for line in result.split('\n') + if line.strip().startswith(('http://', 'https://'))] + except json.JSONDecodeError: + # Fallback to text parsing + urls = [line.strip() for line in result.split('\n') + if line.strip().startswith(('http://', 'https://'))] + + # Clean up URLs + cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls] + cleaned_urls = [url for url in cleaned_urls if url] + + if not cleaned_urls: + print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}") + return [] + + print(f"{Colors.CYAN}Selected URLs for extraction:{Colors.RESET}") + for url in cleaned_urls: + print(f"- {url}") + + return cleaned_urls + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}") + return [] + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": prompt + " for " + company, + "enableWebSearch": True + } + + try: + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload, + timeout=30 + ) + + data = response.json() + + if not data.get('success'): + print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}") + return None + + extraction_id = data.get('id') + if not extraction_id: + print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}") + return None + + return poll_firecrawl_result(extraction_id, api_key) + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request failed: {e}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def poll_firecrawl_result(extraction_id, api_key, interval=10, max_attempts=60): + """Poll Firecrawl API to get the extraction result.""" + url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}" + headers = { + 'Authorization': f'Bearer {api_key}' + } + + print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}") + + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + if data.get('success') and data.get('data'): + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + print(json.dumps(data['data'], indent=2)) + return data['data'] + elif data.get('success') and not data.get('data'): + if attempt % 6 == 0: + print(f"{Colors.YELLOW}Still processing... (attempt {attempt}/{max_attempts}){Colors.RESET}") + time.sleep(interval) + else: + print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}") + return None + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}") + return None + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + serp_results = search_google(f"{company}") + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + selected_urls = select_urls_with_claude(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}No URLs were selected.{Colors.RESET}") + return + + data = extract_company_info(selected_urls, objective, company, firecrawl_api_key) + + if data: + print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}") + else: + print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/deepseek-v3-company-researcher/requirements.txt b/examples/deepseek-v3-company-researcher/requirements.txt new file mode 100644 index 00000000..174461d2 --- /dev/null +++ b/examples/deepseek-v3-company-researcher/requirements.txt @@ -0,0 +1,5 @@ +python-dotenv>=1.0.0 +requests>=2.31.0 +openai>=1.12.0 +google-search-results>=2.4.2 +serpapi>=0.1.5 \ No newline at end of file From 46048bc94d43696277f9575a8be027c1bef60625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 28 Mar 2025 12:42:25 +0100 Subject: [PATCH 020/160] feat(scrapeURL): return js returns from f-e (FIR-1535) (#1385) * feat(scrapeURL): return js returns from f-e * feat(js-sdk): handle new results --- apps/api/src/controllers/v1/types.ts | 4 +++ .../engines/fire-engine/checkStatus.ts | 32 ++++++++++++++++++- .../scrapeURL/engines/fire-engine/index.ts | 1 + .../src/scraper/scrapeURL/engines/index.ts | 4 +++ apps/js-sdk/firecrawl/src/index.ts | 8 +++++ 5 files changed, 48 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9d9109fb..459e5e56 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -541,6 +541,10 @@ export type Document = { actions?: { screenshots?: string[]; scrapes?: ScrapeActionContent[]; + javascriptReturns?: { + type: string, + value: unknown + }[]; }; metadata: { title?: string; diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index ac6fabfd..47322ef0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -42,6 +42,36 @@ const successSchema = z.object({ }) .array() .optional(), + actionResults: z.union([ + z.object({ + idx: z.number(), + type: z.literal("screenshot"), + result: z.object({ + path: z.string(), + }), + }), + z.object({ + idx: z.number(), + type: z.literal("scrape"), + result: z.union([ + z.object({ + url: z.string(), + html: z.string(), + }), + z.object({ + url: z.string(), + accessibility: z.string(), + }), + ]), + }), + z.object({ + idx: z.number(), + type: z.literal("executeJavascript"), + result: z.object({ + return: z.string(), + }), + }), + ]).array().optional(), // chrome-cdp only -- file download handler file: z @@ -138,7 +168,7 @@ export async function fireEngineCheckStatus( } else if ( typeof status.error === "string" && // TODO: improve this later - status.error.includes("Element") + (status.error.includes("Element") || status.error.includes("Javascript execution failed")) ) { throw new ActionError(status.error.split("Error: ")[1]); } else { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index c21d9f90..93596fce 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -274,6 +274,7 @@ export async function scrapeURLWithFireEngineChromeCDP( actions: { screenshots: response.screenshots ?? [], scrapes: response.actionContent ?? [], + javascriptReturns: (response.actionResults ?? []).filter(x => x.type === "executeJavascript").map(x => JSON.parse((x.result as any as { return: string }).return)), }, } : {}), diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 0a688fc6..ab2fe79b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -103,6 +103,10 @@ export type EngineScrapeResult = { actions?: { screenshots: string[]; scrapes: ScrapeActionContent[]; + javascriptReturns: { + type: string; + value: unknown + }[]; }; }; diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 5cc9d119..11fb8d74 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -141,6 +141,14 @@ export interface ScrapeParams Date: Fri, 28 Mar 2025 12:47:34 +0100 Subject: [PATCH 021/160] fix(api): crawl origin tracking (FIR-1499) --- apps/api/src/controllers/v1/crawl.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 9f05e73c..51d373ee 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -108,7 +108,7 @@ export async function crawlController( crawlerOptions, scrapeOptions: sc.scrapeOptions, internalOptions: sc.internalOptions, - origin: "api", + origin: req.body.origin, crawl_id: id, webhook: req.body.webhook, v1: true, From 4f0510e71d78cf89142e5249a1dc92fef8c79ef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 29 Mar 2025 18:05:37 +0100 Subject: [PATCH 022/160] temp: switch over crawl fetches to main instance --- apps/api/src/lib/supabase-jobs.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index e36f3b97..bf25c62f 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -8,7 +8,7 @@ import * as Sentry from "@sentry/node"; * @returns {any | null} Job */ export const supabaseGetJobById = async (jobId: string) => { - const { data, error } = await supabase_rr_service + const { data, error } = await supabase_service .from("firecrawl_jobs") .select("*") .eq("job_id", jobId) @@ -31,7 +31,7 @@ export const supabaseGetJobById = async (jobId: string) => { * @returns {any[]} Jobs */ export const supabaseGetJobsById = async (jobIds: string[]) => { - const { data, error } = await supabase_rr_service + const { data, error } = await supabase_service .from("firecrawl_jobs") .select() .in("job_id", jobIds); @@ -55,7 +55,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => { * @returns {any[]} Jobs */ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { - const { data, error } = await supabase_rr_service + const { data, error } = await supabase_service .from("firecrawl_jobs") .select() .eq("crawl_id", crawlId); @@ -74,7 +74,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { }; export const supabaseGetJobByIdOnlyData = async (jobId: string) => { - const { data, error } = await supabase_rr_service + const { data, error } = await supabase_service .from("firecrawl_jobs") .select("docs, team_id") .eq("job_id", jobId) From b9dde3fc3db78e725af191829ab016e057212d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 29 Mar 2025 18:18:55 +0100 Subject: [PATCH 023/160] temp: move more to main instance --- apps/api/src/controllers/auth.ts | 3 +-- apps/api/src/controllers/v1/crawl-status.ts | 2 +- apps/api/src/services/billing/auto_charge.ts | 2 +- apps/api/src/services/idempotency/validate.ts | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index ae9abcff..6f3f0580 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -97,8 +97,7 @@ export async function getACUC( mode === RateLimiterMode.Extract || mode === RateLimiterMode.ExtractStatus; while (retries < maxRetries) { - const client = - Math.random() > (2/3) ? supabase_rr_service : supabase_service; + const client = supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_26_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 96aa578e..7e847ad5 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -246,7 +246,7 @@ export async function crawlStatusController( let totalCount = jobIDs.length; if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") { - const x = await supabase_rr_service + const x = await supabase_service .from('firecrawl_jobs') .select('*', { count: 'exact', head: true }) .eq("crawl_id", req.params.jobId) diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index b30be201..8540b1ea 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -124,7 +124,7 @@ export async function autoCharge( if (chunk.sub_user_id) { // Fetch the customer's Stripe information const { data: customer, error: customersError } = - await supabase_rr_service + await supabase_service .from("customers") .select("id, stripe_customer_id") .eq("id", chunk.sub_user_id) diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index 54ec7bd0..4a5fbe65 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -18,7 +18,7 @@ export async function validateIdempotencyKey(req: Request): Promise { return false; } - const { data, error } = await supabase_rr_service + const { data, error } = await supabase_service .from("idempotency_keys") .select("key") .eq("key", idempotencyKey); From e0a3c54967bb1a7d470f8059927131c73d1cc64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 30 Mar 2025 17:32:24 +0200 Subject: [PATCH 024/160] new acuc --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 6f3f0580..6ba86525 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -99,7 +99,7 @@ export async function getACUC( while (retries < maxRetries) { const client = supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_26_tally", + "auth_credit_usage_chunk_27_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 58e587d99e8e1134699181a060835f1a372c4d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Mon, 31 Mar 2025 13:27:36 +0100 Subject: [PATCH 025/160] feat(queue-jobs): update notification logic for concurrency limits and add parameter (jsdocs) to batchScrapeUrls --- apps/api/src/services/queue-jobs.ts | 15 ++++++++------- apps/js-sdk/firecrawl/src/index.ts | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 1ce2211c..d2601d30 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -1,6 +1,6 @@ import { getScrapeQueue } from "./queue-service"; import { v4 as uuidv4 } from "uuid"; -import { PlanType, WebScraperOptions } from "../types"; +import { NotificationType, PlanType, WebScraperOptions } from "../types"; import * as Sentry from "@sentry/node"; import { cleanOldConcurrencyLimitEntries, @@ -11,6 +11,7 @@ import { } from "../lib/concurrency-limit"; import { logger } from "../lib/logger"; import { getConcurrencyLimitMax } from "./rate-limiter"; +import { sendNotificationWithCustomDays } from './notification/email_notification'; async function _addScrapeJobToConcurrencyQueue( webScraperOptions: any, @@ -80,9 +81,9 @@ async function addScrapeJobRaw( // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x if(concurrencyQueueJobs > maxConcurrency) { logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id); - // sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => { - // logger.error("Error sending notification (concurrency limit reached): ", error); - // }); + sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); } webScraperOptions.concurrencyLimited = true; @@ -171,9 +172,9 @@ export async function addScrapeJobs( // equals 2x the max concurrency if(addToCQ.length > maxConcurrency) { logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id); - // sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => { - // logger.error("Error sending notification (concurrency limit reached): ", error); - // }); + sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); } await Promise.all( diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 11fb8d74..41d13da0 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -922,6 +922,7 @@ export default class FirecrawlApp { * @param pollInterval - Time in seconds for job status checks. * @param idempotencyKey - Optional idempotency key for the request. * @param webhook - Optional webhook for the batch scrape. + * @param ignoreInvalidURLs - Optional flag to ignore invalid URLs. * @returns The response from the crawl operation. */ async batchScrapeUrls( From b79b90fdd1ba0103e18565cb4359f4d74db5ce2c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 1 Apr 2025 20:53:43 +0400 Subject: [PATCH 026/160] Update auth.ts --- apps/api/src/controllers/auth.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 6ba86525..4bca6809 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -401,6 +401,7 @@ function getPlanByPriceId(price_id: string | null): PlanType { case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY_FIRECRAWL: return "etierscale1"; case process.env.STRIPE_PRICE_ID_ETIER_SCALE_2_YEARLY: + case process.env.STRIPE_PRICE_ID_ETIER_SCALE_2_MONTHLY: return "etierscale2"; case process.env.STRIPE_PRICE_ID_EXTRACT_STARTER_MONTHLY: case process.env.STRIPE_PRICE_ID_EXTRACT_STARTER_YEARLY: From c4255f4fddf98e5d2114c6e17943dfe394f3a4ce Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 1 Apr 2025 21:00:40 +0400 Subject: [PATCH 027/160] Update auth.ts --- apps/api/src/controllers/auth.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 4bca6809..70a154c8 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -416,3 +416,4 @@ function getPlanByPriceId(price_id: string | null): PlanType { return "free"; } } + From ee211132c8b538a9d3902a83866e1be792f14866 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 1 Apr 2025 21:06:27 +0400 Subject: [PATCH 028/160] Nick: --- apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 493 +--------------------------------------- 2 files changed, 4 insertions(+), 490 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 745783b8..3feec21f 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -98,7 +98,6 @@ "json-schema-to-zod": "^2.3.0", "keyword-extractor": "^0.0.28", "koffi": "^2.9.0", - "langchain": "^0.2.8", "languagedetect": "^2.0.0", "lodash": "^4.17.21", "logsnag": "^1.0.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index b11784a3..fdd8aa14 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -149,9 +149,6 @@ importers: koffi: specifier: ^2.9.0 version: 2.9.0 - langchain: - specifier: ^0.2.8 - version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -879,18 +876,6 @@ packages: '@jsdevtools/ono@7.1.3': resolution: {integrity: sha512-4JQNk+3mVzK3xh2rqd6RB4J46qUR19azEHBneZyTZM+c456qOrbbM/5xcR8huNCCcbVt7+UmizG6GuUvPvKUYg==} - '@langchain/core@0.2.12': - resolution: {integrity: sha512-zaKvUcWU1Cxcpd/fxklygY6iUrxls10KTRzyHZGBAIKJq1JD/B10vX59YlFgBs7nqqVTEvaChfIE0O0e2qBttA==} - engines: {node: '>=18'} - - '@langchain/openai@0.2.1': - resolution: {integrity: sha512-Ti3C6ZIUPaueIPAfMljMnLu3GSGNq5KmrlHeWkIbrLShOBlzj4xj7mRfR73oWgAC0qivfxdkfbB0e+WCY+oRJw==} - engines: {node: '>=18'} - - '@langchain/textsplitters@0.0.3': - resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} - engines: {node: '>=18'} - '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -1688,9 +1673,6 @@ packages: '@types/range-parser@1.2.7': resolution: {integrity: sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==} - '@types/retry@0.12.0': - resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==} - '@types/send@0.17.4': resolution: {integrity: sha512-x2EM6TJOybec7c52BX0ZspPodMsQUd5L6PRwOunVyVUhXiBSKf3AezDL8Dgvgt5o0UfKNfuA0eMLr2wLT4AiBA==} @@ -1959,9 +1941,6 @@ packages: resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==} engines: {node: '>=8'} - binary-search@1.3.6: - resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} - bluebird@3.4.7: resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} @@ -2285,10 +2264,6 @@ packages: supports-color: optional: true - decamelize@1.2.0: - resolution: {integrity: sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==} - engines: {node: '>=0.10.0'} - decamelize@4.0.0: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} @@ -2717,11 +2692,6 @@ packages: graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} - handlebars@4.7.8: - resolution: {integrity: sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==} - engines: {node: '>=0.4.7'} - hasBin: true - has-flag@3.0.0: resolution: {integrity: sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==} engines: {node: '>=4'} @@ -2873,9 +2843,6 @@ packages: resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==} engines: {node: '>= 0.10'} - is-any-array@2.0.1: - resolution: {integrity: sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==} - is-arrayish@0.2.1: resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} @@ -3116,9 +3083,6 @@ packages: resolution: {integrity: sha512-cEiJEAEoIbWfCZYKWhVwFuvPX1gETRYPw6LlaTKoxD3s2AkXzkCjnp6h0V77ozyqj0jakteJ4YqDJT830+lVGw==} engines: {node: '>=14'} - js-tiktoken@1.0.12: - resolution: {integrity: sha512-L7wURW1fH9Qaext0VzaUDpFGVQgjkdE3Dgsy9/+yXyGEpBKnylTd0mU0bfbNkKDlXRb6TEsZkwuflu1B8uQbJQ==} - js-tokens@4.0.0: resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} @@ -3173,10 +3137,6 @@ packages: jsonfile@6.1.0: resolution: {integrity: sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==} - jsonpointer@5.0.1: - resolution: {integrity: sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==} - engines: {node: '>=0.10.0'} - jszip@3.10.1: resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} @@ -3198,179 +3158,6 @@ packages: kuler@2.0.0: resolution: {integrity: sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==} - langchain@0.2.8: - resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==} - engines: {node: '>=18'} - peerDependencies: - '@aws-sdk/client-s3': ^3.310.0 - '@aws-sdk/client-sagemaker-runtime': ^3.310.0 - '@aws-sdk/client-sfn': ^3.310.0 - '@aws-sdk/credential-provider-node': ^3.388.0 - '@azure/storage-blob': ^12.15.0 - '@browserbasehq/sdk': '*' - '@gomomento/sdk': ^1.51.1 - '@gomomento/sdk-core': ^1.51.1 - '@gomomento/sdk-web': ^1.51.1 - '@mendable/firecrawl-js': ^0.0.13 - '@notionhq/client': ^2.2.10 - '@pinecone-database/pinecone': '*' - '@supabase/supabase-js': ^2.10.0 - '@vercel/kv': ^0.2.3 - '@xata.io/client': ^0.28.0 - apify-client: ^2.7.1 - assemblyai: ^4.0.0 - axios: '*' - cheerio: ^1.0.0-rc.12 - chromadb: '*' - convex: ^1.3.1 - couchbase: ^4.3.0 - d3-dsv: ^2.0.0 - epub2: ^3.0.1 - faiss-node: '*' - fast-xml-parser: '*' - handlebars: ^4.7.8 - html-to-text: ^9.0.5 - ignore: ^5.2.0 - ioredis: ^5.3.2 - jsdom: '*' - mammoth: ^1.6.0 - mongodb: '>=5.2.0' - node-llama-cpp: '*' - notion-to-md: ^3.1.0 - officeparser: ^4.0.4 - pdf-parse: 1.1.1 - peggy: ^3.0.2 - playwright: ^1.32.1 - puppeteer: ^19.7.2 - pyodide: ^0.24.1 - redis: ^4.6.4 - sonix-speech-recognition: ^2.1.1 - srt-parser-2: ^1.2.3 - typeorm: ^0.3.20 - weaviate-ts-client: '*' - web-auth-library: ^1.0.3 - ws: ^8.14.2 - youtube-transcript: ^1.0.6 - youtubei.js: ^9.1.0 - peerDependenciesMeta: - '@aws-sdk/client-s3': - optional: true - '@aws-sdk/client-sagemaker-runtime': - optional: true - '@aws-sdk/client-sfn': - optional: true - '@aws-sdk/credential-provider-node': - optional: true - '@azure/storage-blob': - optional: true - '@browserbasehq/sdk': - optional: true - '@gomomento/sdk': - optional: true - '@gomomento/sdk-core': - optional: true - '@gomomento/sdk-web': - optional: true - '@mendable/firecrawl-js': - optional: true - '@notionhq/client': - optional: true - '@pinecone-database/pinecone': - optional: true - '@supabase/supabase-js': - optional: true - '@vercel/kv': - optional: true - '@xata.io/client': - optional: true - apify-client: - optional: true - assemblyai: - optional: true - axios: - optional: true - cheerio: - optional: true - chromadb: - optional: true - convex: - optional: true - couchbase: - optional: true - d3-dsv: - optional: true - epub2: - optional: true - faiss-node: - optional: true - fast-xml-parser: - optional: true - handlebars: - optional: true - html-to-text: - optional: true - ignore: - optional: true - ioredis: - optional: true - jsdom: - optional: true - mammoth: - optional: true - mongodb: - optional: true - node-llama-cpp: - optional: true - notion-to-md: - optional: true - officeparser: - optional: true - pdf-parse: - optional: true - peggy: - optional: true - playwright: - optional: true - puppeteer: - optional: true - pyodide: - optional: true - redis: - optional: true - sonix-speech-recognition: - optional: true - srt-parser-2: - optional: true - typeorm: - optional: true - weaviate-ts-client: - optional: true - web-auth-library: - optional: true - ws: - optional: true - youtube-transcript: - optional: true - youtubei.js: - optional: true - - langchainhub@0.0.11: - resolution: {integrity: sha512-WnKI4g9kU2bHQP136orXr2bcRdgz9iiTBpTN0jWt9IlScUKnJBoD0aa2HOzHURQKeQDnt2JwqVmQ6Depf5uDLQ==} - - langsmith@0.1.34: - resolution: {integrity: sha512-aMv2k8kEaovhTuZnK6/6DMCoM7Jurvm1AzdESn+yN+HramRxp3sK32jFRz3ogkXP6GjAjOIofcnNkzhHXSUXGA==} - peerDependencies: - '@langchain/core': '*' - langchain: '*' - openai: '*' - peerDependenciesMeta: - '@langchain/core': - optional: true - langchain: - optional: true - openai: - optional: true - languagedetect@2.0.0: resolution: {integrity: sha512-AZb/liiQ+6ZoTj4f1J0aE6OkzhCo8fyH+tuSaPfSo8YHCWLFJrdSixhtO2TYdIkjcDQNaR4RmGaV2A5FJklDMQ==} engines: {node: '>= 0.4.8'} @@ -3404,9 +3191,6 @@ packages: lodash.merge@4.6.2: resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} - lodash.set@4.3.2: - resolution: {integrity: sha512-4hNPN5jlm/N/HLMCO43v8BXKq9Z7QdAGc/VGrRD61w8gN9g/6jF9A4L1pbUgBLCffi0w9VsXfTOij5x8iTyFvg==} - lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} @@ -3562,21 +3346,6 @@ packages: engines: {node: '>=10'} hasBin: true - ml-array-mean@1.1.6: - resolution: {integrity: sha512-MIdf7Zc8HznwIisyiJGRH9tRigg3Yf4FldW8DxKxpCCv/g5CafTw0RRu51nojVEOXuCQC7DRVVu5c7XXO/5joQ==} - - ml-array-sum@1.1.6: - resolution: {integrity: sha512-29mAh2GwH7ZmiRnup4UyibQZB9+ZLyMShvt4cH4eTK+cL2oEMIZFnSyB3SS8MlsTh6q/w/yh48KmqLxmovN4Dw==} - - ml-distance-euclidean@2.0.0: - resolution: {integrity: sha512-yC9/2o8QF0A3m/0IXqCTXCzz2pNEzvmcE/9HFKOZGnTjatvBbsn4lWYJkxENkA4Ug2fnYl7PXQxnPi21sgMy/Q==} - - ml-distance@4.0.1: - resolution: {integrity: sha512-feZ5ziXs01zhyFUUUeZV5hwc0f5JW0Sh0ckU1koZe/wdVkJdGxcP06KNQuF0WBTj8FttQUzcvQcpcrOp/XrlEw==} - - ml-tree-similarity@1.0.0: - resolution: {integrity: sha512-XJUyYqjSuUQkNQHMscr6tcjldsOoAekxADTplt40QKfwW6nd++1wHWV9AArl0Zvw/TIHgNaZZNvr8QGvE8wLRg==} - module-details-from-path@1.0.3: resolution: {integrity: sha512-ySViT69/76t8VhE1xXHK6Ch4NcDd26gx0MzKXLO+F7NOtnqH68d9zF94nT8ZWSxXh8ELOERsnJO/sWt1xZYw5A==} @@ -3641,10 +3410,6 @@ packages: msgpackr@1.11.2: resolution: {integrity: sha512-F9UngXRlPyWCDEASDpTf6c9uNhGPTqnTeLVt7bN+bU1eajoR/8V9ys2BRaV5C/e5ihE6sJ9uPIKaYt6bFuO32g==} - mustache@4.2.0: - resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} - hasBin: true - nanoid@3.3.8: resolution: {integrity: sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -3661,9 +3426,6 @@ packages: resolution: {integrity: sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==} engines: {node: '>= 0.6'} - neo-async@2.6.2: - resolution: {integrity: sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==} - netmask@2.0.2: resolution: {integrity: sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==} engines: {node: '>= 0.4.0'} @@ -3730,10 +3492,6 @@ packages: nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} - num-sort@2.1.0: - resolution: {integrity: sha512-1MQz1Ed8z2yckoBeSfkQHHO9K1yDRxxtotKSJ9yvcTUUxSvfvzEq5GwBrjjHEpMlq/k5gvXdmJ1SbYxWtpNoVg==} - engines: {node: '>=8'} - nwsapi@2.2.16: resolution: {integrity: sha512-F1I/bimDpj3ncaNDhfyMWuFqmQDBwDB0Fogc2qpL3BWvkQteFD/8BzWuIRl83rq0DXfm8SGt/HFhLXZyljTXcQ==} @@ -3770,18 +3528,6 @@ packages: openai@3.3.0: resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==} - openai@4.57.0: - resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==} - hasBin: true - peerDependencies: - zod: ^3.23.8 - peerDependenciesMeta: - zod: - optional: true - - openapi-types@12.1.3: - resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} - opener@1.5.2: resolution: {integrity: sha512-ur5UIdyw5Y7yEj9wLzhqXiy6GZ3Mwx0yGI+5sMn2r0N0v3cKJvUmFH5yPP+WXh9e0xfyzyJX95D8l088DNFj7A==} hasBin: true @@ -3795,10 +3541,6 @@ packages: option@0.2.4: resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} - p-finally@1.0.0: - resolution: {integrity: sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==} - engines: {node: '>=4'} - p-limit@2.3.0: resolution: {integrity: sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==} engines: {node: '>=6'} @@ -3811,18 +3553,6 @@ packages: resolution: {integrity: sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==} engines: {node: '>=8'} - p-queue@6.6.2: - resolution: {integrity: sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==} - engines: {node: '>=8'} - - p-retry@4.6.2: - resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==} - engines: {node: '>=8'} - - p-timeout@3.2.0: - resolution: {integrity: sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==} - engines: {node: '>=8'} - p-try@2.2.0: resolution: {integrity: sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==} engines: {node: '>=6'} @@ -4166,10 +3896,6 @@ packages: resolution: {integrity: sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==} hasBin: true - retry@0.13.1: - resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} - engines: {node: '>= 4'} - rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4584,8 +4310,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.7.3: - resolution: {integrity: sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==} + typescript@5.8.2: + resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==} engines: {node: '>=14.17'} hasBin: true @@ -4595,11 +4321,6 @@ packages: peerDependencies: '@babel/runtime': ^7.23.2 - uglify-js@3.18.0: - resolution: {integrity: sha512-SyVVbcNBCk0dzr9XL/R/ySrmYf0s372K6/hFklzgcp2lBFyXtw4I7BOdDjlLhE1aVqaI/SHWXWmYdlZxuyF38A==} - engines: {node: '>=0.8.0'} - hasBin: true - unbzip2-stream@1.4.3: resolution: {integrity: sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==} @@ -4749,9 +4470,6 @@ packages: engines: {node: '>=6'} hasBin: true - wordwrap@1.0.0: - resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==} - wrap-ansi@7.0.0: resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} engines: {node: '>=10'} @@ -4832,11 +4550,6 @@ packages: resolution: {integrity: sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==} engines: {node: '>=18'} - yaml@2.4.5: - resolution: {integrity: sha512-aBx2bnqDzVOyNKfsysjA2ms5ZlnjSAW2eG3/L5G/CSujfjLJTJsEw1bGw8kCf04KodQWk1pxlGnZ56CRxiawmg==} - engines: {node: '>= 14'} - hasBin: true - yargs-parser@21.1.1: resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==} engines: {node: '>=12'} @@ -4860,11 +4573,6 @@ packages: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} - zod-to-json-schema@3.23.1: - resolution: {integrity: sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==} - peerDependencies: - zod: ^3.23.3 - zod-to-json-schema@3.24.2: resolution: {integrity: sha512-pNUqrcSxuuB3/+jBbU8qKUbTbDqYUaG1vf5cXFjbhGgoUuA1amO/y4Q8lzfOhHU8HNPK6VFJ18lBDKj3OHyDsg==} peerDependencies: @@ -5926,43 +5634,6 @@ snapshots: '@jsdevtools/ono@7.1.3': {} - '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))': - dependencies: - ansi-styles: 5.2.0 - camelcase: 6.3.0 - decamelize: 1.2.0 - js-tiktoken: 1.0.12 - langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a) - ml-distance: 4.0.1 - mustache: 4.2.0 - p-queue: 6.6.2 - p-retry: 4.6.2 - uuid: 9.0.1 - zod: 3.24.2 - zod-to-json-schema: 3.23.1(zod@3.24.2) - transitivePeerDependencies: - - langchain - - openai - - '@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': - dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2)) - js-tiktoken: 1.0.12 - openai: 4.57.0(encoding@0.1.13)(zod@3.24.2) - zod: 3.24.2 - zod-to-json-schema: 3.23.1(zod@3.24.2) - transitivePeerDependencies: - - encoding - - langchain - - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))': - dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2)) - js-tiktoken: 1.0.12 - transitivePeerDependencies: - - langchain - - openai - '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -6999,8 +6670,6 @@ snapshots: '@types/range-parser@1.2.7': {} - '@types/retry@0.12.0': {} - '@types/send@0.17.4': dependencies: '@types/mime': 1.3.5 @@ -7301,8 +6970,6 @@ snapshots: binary-extensions@2.3.0: {} - binary-search@1.3.6: {} - bluebird@3.4.7: {} body-parser@1.20.2: @@ -7657,8 +7324,6 @@ snapshots: dependencies: ms: 2.1.2 - decamelize@1.2.0: {} - decamelize@4.0.0: {} decimal.js@10.5.0: {} @@ -8083,16 +7748,6 @@ snapshots: graceful-fs@4.2.11: {} - handlebars@4.7.8: - dependencies: - minimist: 1.2.8 - neo-async: 2.6.2 - source-map: 0.6.1 - wordwrap: 1.0.0 - optionalDependencies: - uglify-js: 3.18.0 - optional: true - has-flag@3.0.0: {} has-flag@4.0.0: {} @@ -8287,8 +7942,6 @@ snapshots: ipaddr.js@1.9.1: {} - is-any-array@2.0.1: {} - is-arrayish@0.2.1: {} is-arrayish@0.3.2: {} @@ -8718,10 +8371,6 @@ snapshots: js-cookie@3.0.5: {} - js-tiktoken@1.0.12: - dependencies: - base64-js: 1.5.1 - js-tokens@4.0.0: {} js-yaml@3.14.1: @@ -8787,8 +8436,6 @@ snapshots: optionalDependencies: graceful-fs: 4.2.11 - jsonpointer@5.0.1: {} - jszip@3.10.1: dependencies: lie: 3.3.0 @@ -8806,60 +8453,6 @@ snapshots: kuler@2.0.0: {} - langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): - dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2)) - '@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2)) - binary-extensions: 2.3.0 - js-tiktoken: 1.0.12 - js-yaml: 4.1.0 - jsonpointer: 5.0.1 - langchainhub: 0.0.11 - langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a) - ml-distance: 4.0.1 - openapi-types: 12.1.3 - p-retry: 4.6.2 - uuid: 9.0.1 - yaml: 2.4.5 - zod: 3.24.2 - zod-to-json-schema: 3.23.1(zod@3.24.2) - optionalDependencies: - '@aws-sdk/credential-provider-node': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0) - '@pinecone-database/pinecone': 4.0.0 - '@supabase/supabase-js': 2.44.2 - axios: 1.7.2 - cheerio: 1.0.0-rc.12 - fast-xml-parser: 4.4.1 - handlebars: 4.7.8 - html-to-text: 9.0.5 - ioredis: 5.4.1 - jsdom: 26.0.0 - mammoth: 1.7.2 - mongodb: 6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3) - pdf-parse: 1.1.1 - puppeteer: 22.12.1(typescript@5.4.5) - redis: 4.6.14 - ws: 8.18.0 - transitivePeerDependencies: - - encoding - - openai - - langchainhub@0.0.11: {} - - langsmith@0.1.34(7c31787ccbd7899ead3aa20aba61c53a): - dependencies: - '@types/uuid': 9.0.8 - commander: 10.0.1 - lodash.set: 4.3.2 - p-queue: 6.6.2 - p-retry: 4.6.2 - uuid: 9.0.1 - optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2)) - langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.24.2))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) - openai: 4.57.0(encoding@0.1.13)(zod@3.24.2) - languagedetect@2.0.0: {} leac@0.6.0: {} @@ -8884,8 +8477,6 @@ snapshots: lodash.merge@4.6.2: {} - lodash.set@4.3.2: {} - lodash@4.17.21: {} logform@2.6.0: @@ -9033,27 +8624,6 @@ snapshots: mkdirp@3.0.1: {} - ml-array-mean@1.1.6: - dependencies: - ml-array-sum: 1.1.6 - - ml-array-sum@1.1.6: - dependencies: - is-any-array: 2.0.1 - - ml-distance-euclidean@2.0.0: {} - - ml-distance@4.0.1: - dependencies: - ml-array-mean: 1.1.6 - ml-distance-euclidean: 2.0.0 - ml-tree-similarity: 1.0.0 - - ml-tree-similarity@1.0.0: - dependencies: - binary-search: 1.3.6 - num-sort: 2.1.0 - module-details-from-path@1.0.3: {} moment@2.30.1: {} @@ -9121,8 +8691,6 @@ snapshots: optionalDependencies: msgpackr-extract: 3.0.3 - mustache@4.2.0: {} - nanoid@3.3.8: {} natural-compare@1.4.0: {} @@ -9158,9 +8726,6 @@ snapshots: negotiator@0.6.3: {} - neo-async@2.6.2: - optional: true - netmask@2.0.2: {} node-abi@3.67.0: @@ -9223,8 +8788,6 @@ snapshots: dependencies: boolbase: 1.0.0 - num-sort@2.1.0: {} - nwsapi@2.2.16: {} object-assign@4.1.1: {} @@ -9262,24 +8825,6 @@ snapshots: transitivePeerDependencies: - debug - openai@4.57.0(encoding@0.1.13)(zod@3.24.2): - dependencies: - '@types/node': 18.19.39 - '@types/node-fetch': 2.6.11 - '@types/qs': 6.9.15 - abort-controller: 3.0.0 - agentkeepalive: 4.5.0 - form-data-encoder: 1.7.2 - formdata-node: 4.4.1 - node-fetch: 2.7.0(encoding@0.1.13) - qs: 6.12.2 - optionalDependencies: - zod: 3.24.2 - transitivePeerDependencies: - - encoding - - openapi-types@12.1.3: {} - opener@1.5.2: {} opentelemetry-instrumentation-fetch-node@1.2.3(@opentelemetry/api@1.9.0): @@ -9293,8 +8838,6 @@ snapshots: option@0.2.4: {} - p-finally@1.0.0: {} - p-limit@2.3.0: dependencies: p-try: 2.2.0 @@ -9307,20 +8850,6 @@ snapshots: dependencies: p-limit: 2.3.0 - p-queue@6.6.2: - dependencies: - eventemitter3: 4.0.7 - p-timeout: 3.2.0 - - p-retry@4.6.2: - dependencies: - '@types/retry': 0.12.0 - retry: 0.13.1 - - p-timeout@3.2.0: - dependencies: - p-finally: 1.0.0 - p-try@2.2.0: {} pac-proxy-agent@7.0.2: @@ -9507,7 +9036,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.7.3 + typescript: 5.8.2 uuid: 9.0.1 zod: 3.24.2 transitivePeerDependencies: @@ -9705,8 +9234,6 @@ snapshots: path-parse: 1.0.7 supports-preserve-symlinks-flag: 1.0.0 - retry@0.13.1: {} - rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -10121,7 +9648,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.7.3: {} + typescript@5.8.2: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: @@ -10131,9 +9658,6 @@ snapshots: transitivePeerDependencies: - debug - uglify-js@3.18.0: - optional: true - unbzip2-stream@1.4.3: dependencies: buffer: 5.7.1 @@ -10272,9 +9796,6 @@ snapshots: underscore: 1.13.6 wordnet-db: 3.1.14 - wordwrap@1.0.0: - optional: true - wrap-ansi@7.0.0: dependencies: ansi-styles: 4.3.0 @@ -10326,8 +9847,6 @@ snapshots: yallist@5.0.0: {} - yaml@2.4.5: {} - yargs-parser@21.1.1: {} yargs-unparser@2.0.0: @@ -10356,10 +9875,6 @@ snapshots: yocto-queue@0.1.0: {} - zod-to-json-schema@3.23.1(zod@3.24.2): - dependencies: - zod: 3.24.2 - zod-to-json-schema@3.24.2(zod@3.24.2): dependencies: zod: 3.24.2 From 74684645521fec0ea0a26655715d02b87bc85661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Tue, 1 Apr 2025 19:50:26 +0100 Subject: [PATCH 029/160] feat(queue-jobs): implement conditional notification for concurrency limits based on team subscription status --- .../notification/notification-check.ts | 9 +++++ apps/api/src/services/queue-jobs.ts | 21 +++++++--- .../services/subscription/enterprise-check.ts | 39 +++++++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 apps/api/src/services/notification/notification-check.ts create mode 100644 apps/api/src/services/subscription/enterprise-check.ts diff --git a/apps/api/src/services/notification/notification-check.ts b/apps/api/src/services/notification/notification-check.ts new file mode 100644 index 00000000..86f3eada --- /dev/null +++ b/apps/api/src/services/notification/notification-check.ts @@ -0,0 +1,9 @@ +import { isEnterpriseTeamCreatedAfterRateLimitChange } from "../subscription/enterprise-check"; + +export async function shouldSendConcurrencyLimitNotification( + team_id: string, +): Promise { + const isEnterprise = + await isEnterpriseTeamCreatedAfterRateLimitChange(team_id); + return !isEnterprise; +} diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index d2601d30..535f51b5 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -12,6 +12,7 @@ import { import { logger } from "../lib/logger"; import { getConcurrencyLimitMax } from "./rate-limiter"; import { sendNotificationWithCustomDays } from './notification/email_notification'; +import { shouldSendConcurrencyLimitNotification } from './notification/notification-check'; async function _addScrapeJobToConcurrencyQueue( webScraperOptions: any, @@ -81,9 +82,13 @@ async function addScrapeJobRaw( // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x if(concurrencyQueueJobs > maxConcurrency) { logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id); - sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { - logger.error("Error sending notification (concurrency limit reached): ", error); - }); + + const shouldSendNotification = await shouldSendConcurrencyLimitNotification(webScraperOptions.team_id); + if (shouldSendNotification) { + sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); + } } webScraperOptions.concurrencyLimited = true; @@ -172,9 +177,13 @@ export async function addScrapeJobs( // equals 2x the max concurrency if(addToCQ.length > maxConcurrency) { logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id); - sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { - logger.error("Error sending notification (concurrency limit reached): ", error); - }); + + const shouldSendNotification = await shouldSendConcurrencyLimitNotification(jobs[0].data.team_id); + if (shouldSendNotification) { + sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); + } } await Promise.all( diff --git a/apps/api/src/services/subscription/enterprise-check.ts b/apps/api/src/services/subscription/enterprise-check.ts new file mode 100644 index 00000000..eb966a8d --- /dev/null +++ b/apps/api/src/services/subscription/enterprise-check.ts @@ -0,0 +1,39 @@ +import { supabase_service } from "../supabase"; + +interface SubscriptionResponse { + prices: { + products: { + is_enterprise: boolean; + }; + }; +} + +const RATE_LIMIT_CHANGE_NOTIFICATION_START_DATE = new Date("2025-03-12"); + +export async function isEnterpriseTeamCreatedAfterRateLimitChange( + team_id: string, +): Promise { + const { data, error } = (await supabase_service + .from("subscriptions") + .select("prices(products(is_enterprise))") + .eq("status", "active") + .eq("team_id", team_id) + .gt( + "created", + RATE_LIMIT_CHANGE_NOTIFICATION_START_DATE.toISOString(), + )) as { + data: SubscriptionResponse[] | null; + error: any; + }; + + if (error || !data) { + // If there's an error or no subscription found, assume non-enterprise + return false; + } + + const isEnterprise = data.find( + (sub) => sub.prices?.products?.is_enterprise === true, + ); + + return !!isEnterprise; +} From 7216799ca020b2d39ace19509a1d1f8c3b86c72a Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 2 Apr 2025 10:45:11 -0300 Subject: [PATCH 030/160] revert mog changes --- apps/api/src/controllers/auth.ts | 3 ++- apps/api/src/controllers/v1/crawl-status.ts | 2 +- apps/api/src/lib/supabase-jobs.ts | 8 ++++---- apps/api/src/services/billing/auto_charge.ts | 2 +- apps/api/src/services/idempotency/validate.ts | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 70a154c8..7b38181d 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -97,7 +97,8 @@ export async function getACUC( mode === RateLimiterMode.Extract || mode === RateLimiterMode.ExtractStatus; while (retries < maxRetries) { - const client = supabase_service; + const client = + Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_27_tally", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 7e847ad5..96aa578e 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -246,7 +246,7 @@ export async function crawlStatusController( let totalCount = jobIDs.length; if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") { - const x = await supabase_service + const x = await supabase_rr_service .from('firecrawl_jobs') .select('*', { count: 'exact', head: true }) .eq("crawl_id", req.params.jobId) diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index bf25c62f..e36f3b97 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -8,7 +8,7 @@ import * as Sentry from "@sentry/node"; * @returns {any | null} Job */ export const supabaseGetJobById = async (jobId: string) => { - const { data, error } = await supabase_service + const { data, error } = await supabase_rr_service .from("firecrawl_jobs") .select("*") .eq("job_id", jobId) @@ -31,7 +31,7 @@ export const supabaseGetJobById = async (jobId: string) => { * @returns {any[]} Jobs */ export const supabaseGetJobsById = async (jobIds: string[]) => { - const { data, error } = await supabase_service + const { data, error } = await supabase_rr_service .from("firecrawl_jobs") .select() .in("job_id", jobIds); @@ -55,7 +55,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => { * @returns {any[]} Jobs */ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { - const { data, error } = await supabase_service + const { data, error } = await supabase_rr_service .from("firecrawl_jobs") .select() .eq("crawl_id", crawlId); @@ -74,7 +74,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { }; export const supabaseGetJobByIdOnlyData = async (jobId: string) => { - const { data, error } = await supabase_service + const { data, error } = await supabase_rr_service .from("firecrawl_jobs") .select("docs, team_id") .eq("job_id", jobId) diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 8540b1ea..b30be201 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -124,7 +124,7 @@ export async function autoCharge( if (chunk.sub_user_id) { // Fetch the customer's Stripe information const { data: customer, error: customersError } = - await supabase_service + await supabase_rr_service .from("customers") .select("id, stripe_customer_id") .eq("id", chunk.sub_user_id) diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index 4a5fbe65..54ec7bd0 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -18,7 +18,7 @@ export async function validateIdempotencyKey(req: Request): Promise { return false; } - const { data, error } = await supabase_service + const { data, error } = await supabase_rr_service .from("idempotency_keys") .select("key") .eq("key", idempotencyKey); From b900f34b5a0d29f219d4abeb7e5cac9fc2d793f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Wed, 2 Apr 2025 17:36:11 +0100 Subject: [PATCH 031/160] feat(notification): add notification message for concurrency limit reached --- apps/api/src/services/notification/notification_string.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/services/notification/notification_string.ts b/apps/api/src/services/notification/notification_string.ts index 46da76e0..39ba608d 100644 --- a/apps/api/src/services/notification/notification_string.ts +++ b/apps/api/src/services/notification/notification_string.ts @@ -15,6 +15,8 @@ export function getNotificationString( return "Auto-recharge successful"; case NotificationType.AUTO_RECHARGE_FAILED: return "Auto-recharge failed"; + case NotificationType.CONCURRENCY_LIMIT_REACHED: + return "Concurrency limit reached"; default: return "Unknown notification type"; } From b3b63486f193d40bcdf2829ebdd779b24a14aff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 2 Apr 2025 19:27:10 +0200 Subject: [PATCH 032/160] cc manual --- apps/api/src/services/rate-limiter.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index a8f885a0..ddb686a5 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -240,6 +240,7 @@ const testSuiteTokens = [ "0a18c9e", // gh ]; +const manual_growth = ["778c62c4-306f-4039-b372-eb20174760c0"]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e"]; @@ -331,6 +332,10 @@ export function getConcurrencyLimitMax( return CONCURRENCY_LIMIT.etier2c; } + if (teamId && manual_growth.includes(teamId)) { + return CONCURRENCY_LIMIT.growth; + } + return CONCURRENCY_LIMIT[plan] ?? 10; } From 24f519935916c368cc91b619477b904cc87fa21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 2 Apr 2025 19:52:43 +0200 Subject: [PATCH 033/160] compare format (FIR-1560) (#1405) --- apps/api/requests.http | 2 +- apps/api/src/__tests__/snips/scrape.test.ts | 12 +++ apps/api/src/controllers/v0/crawl.ts | 1 + apps/api/src/controllers/v0/crawlPreview.ts | 1 + apps/api/src/controllers/v0/scrape.ts | 2 + apps/api/src/controllers/v0/search.ts | 1 + apps/api/src/controllers/v1/batch-scrape.ts | 2 +- apps/api/src/controllers/v1/crawl.ts | 2 +- apps/api/src/controllers/v1/map.ts | 2 +- apps/api/src/controllers/v1/scrape.ts | 2 +- apps/api/src/controllers/v1/search.ts | 2 +- apps/api/src/controllers/v1/types.ts | 22 +++++- apps/api/src/lib/crawl-redis.ts | 26 +++++-- apps/api/src/lib/extract/document-scraper.ts | 1 + apps/api/src/main/runWebScraper.ts | 4 + apps/api/src/scraper/WebScraper/sitemap.ts | 1 + apps/api/src/scraper/scrapeURL/index.ts | 5 +- .../src/scraper/scrapeURL/scrapeURL.test.ts | 32 ++++---- .../scraper/scrapeURL/transformers/diff.ts | 42 +++++++++++ .../scraper/scrapeURL/transformers/index.ts | 14 +++- apps/api/src/services/queue-worker.ts | 73 ++++++++++++++++++- apps/api/src/types.ts | 1 + apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 7 +- 24 files changed, 223 insertions(+), 36 deletions(-) create mode 100644 apps/api/src/scraper/scrapeURL/transformers/diff.ts diff --git a/apps/api/requests.http b/apps/api/requests.http index a3997371..4c69d011 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "https://firecrawl.dev" + "url":"https://firecrawl.dev" } ### Check Crawl Status diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 2b697ca9..7495e789 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -84,6 +84,18 @@ describe("Scrape tests", () => { // expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//); // }, 30000); // }); + + describe("Compare format", () => { + it.concurrent("works", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "compare"], + }); + + expect(response.compare).toBeDefined(); + expect(response.compare?.previousScrapeAt).not.toBeNull(); + }); + }); describe("Location API (f-e dependant)", () => { it.concurrent("works without specifying an explicit location", async () => { diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 2eba651d..c8b186b0 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index ffb8ebba..9153ea79 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); const sc: StoredCrawl = { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 62d62b09..0bdd197b 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -66,6 +66,7 @@ export async function scrapeHelper( extractorOptions, timeout, crawlerOptions, + team_id, ); await addScrapeJob( @@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) { pageOptions, extractorOptions, timeout, + team_id, ); logJob({ diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index ac7d7f62..d8649a52 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -72,6 +72,7 @@ export async function searchHelper( undefined, 60000, crawlerOptions, + team_id, ); if (justSearch) { diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index d2c079bf..20fab47c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -82,7 +82,7 @@ export async function batchScrapeController( : { crawlerOptions: null, scrapeOptions: req.body, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 51d373ee..31e39502 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -81,7 +81,7 @@ export async function crawlController( originUrl: req.body.url, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), scrapeOptions, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index ebb0b324..49890d90 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -85,7 +85,7 @@ export async function getMapResults({ scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { teamId }, team_id: teamId, createdAt: Date.now(), plan: plan, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ec11e2cb..44214ee2 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -50,7 +50,7 @@ export async function scrapeController( mode: "single_urls", team_id: req.auth.team_id, scrapeOptions: req.body, - internalOptions: {}, + internalOptions: { teamId: req.auth.team_id }, plan: req.auth.plan!, origin: req.body.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 18ff9579..082cd8cd 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -83,7 +83,7 @@ async function scrapeSearchResult( mode: "single_urls" as Mode, team_id: options.teamId, scrapeOptions: options.scrapeOptions, - internalOptions: {}, + internalOptions: { teamId: options.teamId }, plan: options.plan || "free", origin: options.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 459e5e56..b610826d 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -20,7 +20,8 @@ export type Format = | "links" | "screenshot" | "screenshot@fullPage" - | "extract"; + | "extract" + | "compare"; export const url = z.preprocess( (x) => { @@ -165,6 +166,7 @@ const baseScrapeOptions = z "screenshot@fullPage", "extract", "json", + "compare", ]) .array() .optional() @@ -172,6 +174,10 @@ const baseScrapeOptions = z .refine( (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage", + ) + .refine( + (x) => !x.includes("compare") || x.includes("markdown"), + "The compare format requires the markdown format to be specified as well", ), headers: z.record(z.string(), z.string()).optional(), includeTags: z.string().array().optional(), @@ -546,6 +552,11 @@ export type Document = { value: unknown }[]; }; + compare?: { + previousScrapeAt: string | null; + changeStatus: "new" | "same" | "changed" | "removed"; + visibility: "visible" | "hidden"; + } metadata: { title?: string; description?: string; @@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { }; } -export function fromLegacyCrawlerOptions(x: any): { +export function fromLegacyCrawlerOptions(x: any, teamId: string): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions; } { @@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): { }), internalOptions: { v0CrawlOnlyUrls: x.returnOnlyUrls, + teamId, }, }; } @@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions( pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { return { scrapeOptions: scrapeOptions.parse({ @@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions( internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, + teamId, }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; @@ -906,13 +920,15 @@ export function fromLegacyCombo( extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( pageOptions, extractorOptions, timeout, + teamId, ); - const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); + const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId); return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; } diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b741e615..e261800f 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -154,20 +154,20 @@ export async function finishCrawlKickoff(id: string) { ); } -export async function finishCrawl(id: string) { +export async function finishCrawlPre(id: string) { if (await isCrawlFinished(id)) { - _logger.debug("Marking crawl as finished.", { + _logger.debug("Marking crawl as pre-finished.", { module: "crawl-redis", - method: "finishCrawl", + method: "finishCrawlPre", crawlId: id, }); - const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); - await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); + const set = await redisConnection.setnx("crawl:" + id + ":finished_pre", "yes"); + await redisConnection.expire("crawl:" + id + ":finished_pre", 24 * 60 * 60); return set === 1; } else { - _logger.debug("Crawl can not be finished yet, not marking as finished.", { + _logger.debug("Crawl can not be pre-finished yet, not marking as finished.", { module: "crawl-redis", - method: "finishCrawl", + method: "finishCrawlPre", crawlId: id, jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"), jobs: await redisConnection.scard("crawl:" + id + ":jobs"), @@ -177,6 +177,16 @@ export async function finishCrawl(id: string) { } } +export async function finishCrawl(id: string) { + _logger.debug("Marking crawl as finished.", { + module: "crawl-redis", + method: "finishCrawl", + crawlId: id, + }); + await redisConnection.set("crawl:" + id + ":finish", "yes"); + await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); +} + export async function getCrawlJobs(id: string): Promise { return await redisConnection.smembers("crawl:" + id + ":jobs"); } @@ -250,7 +260,7 @@ export function generateURLPermutations(url: string | URL): URL[] { return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare]; }); - return permutations; + return [...new Set(permutations.map(x => x.href))].map(x => new URL(x)); } export async function lockURL( diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 8cbc75fd..e9bd729a 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -44,6 +44,7 @@ export async function scrapeDocument( scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }), internalOptions: { useCache: true, + teamId: options.teamId, }, plan: options.plan, origin: options.origin, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c6751218..ba983e07 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -51,6 +51,7 @@ export async function startWebScraperPipeline({ priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), + urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false, }); } @@ -66,6 +67,7 @@ export async function runWebScraper({ priority, is_scrape = false, is_crawl = false, + urlInvisibleInCurrentCrawl = false, }: RunWebScraperParams): Promise { const logger = _logger.child({ method: "runWebScraper", @@ -97,6 +99,8 @@ export async function runWebScraper({ response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions, + urlInvisibleInCurrentCrawl, + teamId: internalOptions?.teamId ?? team_id, }); if (!response.success) { if (response.error instanceof Error) { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c2c60383..f945cd22 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -47,6 +47,7 @@ export async function getLinksFromSitemap( ], v0DisableJsDom: true, abort, + teamId: "sitemap", }, ); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index eaf5497a..e047d1cb 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -162,6 +162,8 @@ async function buildMetaObject( } export type InternalOptions = { + teamId: string; + priority?: number; // Passed along to fire-engine forceEngine?: Engine | Engine[]; atsv?: boolean; // anti-bot solver, beta @@ -173,6 +175,7 @@ export type InternalOptions = { isBackgroundIndex?: boolean; fromCache?: boolean; // Indicates if the document was retrieved from cache abort?: AbortSignal; + urlInvisibleInCurrentCrawl?: boolean; }; export type EngineResultsTracker = { @@ -383,7 +386,7 @@ export async function scrapeURL( id: string, url: string, options: ScrapeOptions, - internalOptions: InternalOptions = {}, + internalOptions: InternalOptions, ): Promise { const meta = await buildMetaObject(id, url, options, internalOptions); try { diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 8b783821..b545266f 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["markdown", "html"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ onlyMainContent: false, }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => { onlyMainContent: false, excludeTags: [".nav", "#footer", "strong"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot@fullPage"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => { async (i) => { const url = "https://www.scrapethissite.com/?i=" + i; const id = "test:concurrent:" + url; - const out = await scrapeURL(id, url, scrapeOptions.parse({})); + const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }); const replacer = (key: string, value: any) => { if (value instanceof Error) { diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts new file mode 100644 index 00000000..9628844d --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -0,0 +1,42 @@ +import { supabase_service } from "../../../services/supabase"; +import { Document } from "../../../controllers/v1/types"; +import { Meta } from "../index"; + +export async function deriveDiff(meta: Meta, document: Document): Promise { + if (meta.options.formats.includes("compare")) { + const res = await supabase_service + .rpc("diff_get_last_scrape_1", { + i_team_id: meta.internalOptions.teamId, + i_url: document.metadata.sourceURL ?? meta.url, + }); + + const data: { + o_docs: Document[], + o_date_added: string, + } | undefined | null = (res.data ?? [])[0] as any; + + if (data && data.o_docs.length > 0) { + const previousMarkdown = data.o_docs[0].markdown!; + const currentMarkdown = document.markdown!; + + const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); + + document.compare = { + previousScrapeAt: data.o_date_added, + changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed", + visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", + } + } else if (!res.error) { + document.compare = { + previousScrapeAt: null, + changeStatus: document.metadata.statusCode === 404 ? "removed" : "new", + visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", + } + } else { + meta.logger.error("Error fetching previous scrape", { error: res.error }); + document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : ""); + } + } + + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index ea149dba..114c59a8 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract"; import { uploadScreenshot } from "./uploadScreenshot"; import { removeBase64Images } from "./removeBase64Images"; import { saveToCache } from "./cache"; - +import { deriveDiff } from "./diff"; export type Transformer = ( meta: Meta, document: Document, @@ -148,6 +148,17 @@ export function coerceFieldsToFormats( ); } + if (!formats.has("compare") && document.compare !== undefined) { + meta.logger.warn( + "Removed compare from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", + ); + delete document.compare; + } else if (formats.has("compare") && document.compare === undefined) { + meta.logger.warn( + "Request had format compare, but there was no compare field in the result.", + ); + } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { delete document.actions; } @@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [ deriveMetadataFromRawHTML, uploadScreenshot, performLLMExtract, + deriveDiff, coerceFieldsToFormats, removeBase64Images, ]; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index b6cfc454..a7da10f7 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -28,6 +28,7 @@ import { addCrawlJobs, crawlToCrawler, finishCrawl, + finishCrawlPre, finishCrawlKickoff, generateURLPermutations, getCrawl, @@ -100,7 +101,77 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const runningJobs: Set = new Set(); async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { - if (await finishCrawl(job.data.crawl_id)) { + if (await finishCrawlPre(job.data.crawl_id)) { + if (job.data.crawlerOptions && !await redisConnection.exists("crawl:" + job.data.crawl_id + ":invisible_urls")) { + await redisConnection.set("crawl:" + job.data.crawl_id + ":invisible_urls", "done", "EX", 60 * 60 * 24); + + const sc = (await getCrawl(job.data.crawl_id))!; + + const visitedUrls = new Set(await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited_unique", + )); + + const lastUrls: string[] = ((await supabase_service.rpc("diff_get_last_crawl_urls", { + i_team_id: job.data.team_id, + i_url: sc.originUrl!, + })).data ?? []).map(x => x.url); + + const lastUrlsSet = new Set(lastUrls); + + const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)); + const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); + + console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount); + + if (univistedUrls.length !== 0 && addableJobCount > 0) { + const jobs = univistedUrls.slice(0, addableJobCount).map((url) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: { + ...job.data.crawlerOptions, + urlInvisibleInCurrentCrawl: true, + }, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + const lockedIds = await lockURLsIndividually( + job.data.crawl_id, + sc, + jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })), + ); + const lockedJobs = jobs.filter((x) => + lockedIds.find((y) => y.id === x.opts.jobId), + ); + await addCrawlJobs( + job.data.crawl_id, + lockedJobs.map((x) => x.opts.jobId), + ); + await addScrapeJobs(lockedJobs); + + return; + } + } + + await finishCrawl(job.data.crawl_id); + (async () => { const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index f051b183..adc7df37 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -59,6 +59,7 @@ export interface RunWebScraperParams { priority?: number; is_scrape?: boolean; is_crawl?: boolean; + urlInvisibleInCurrentCrawl?: boolean; } export type RunWebScraperResult = diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0e6dbf7c..772f2a3e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.21.0", + "version": "1.21.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 41d13da0..bec5288f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -69,6 +69,11 @@ export interface FirecrawlDocument; includeTags?: string[]; excludeTags?: string[]; From 2e2c3d52ced4003611c124e7cf6df1e069c8fff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 3 Apr 2025 09:57:16 +0200 Subject: [PATCH 034/160] feat: add swoogo classes to force include main tags --- .../api/sharedLibs/html-transformer/src/lib.rs | 18 ++++++++++++++++-- .../scrapeURL/lib/removeUnwantedElements.ts | 16 +++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs index 8643208d..8d944f22 100644 --- a/apps/api/sharedLibs/html-transformer/src/lib.rs +++ b/apps/api/sharedLibs/html-transformer/src/lib.rs @@ -197,8 +197,22 @@ const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [ "#cookie", ]; -const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = [ - "#main" +const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [ + "#main", + + // swoogo event software as .widget in all of their content + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda", ]; #[derive(Deserialize)] diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index 693b02bb..62a0a726 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -49,7 +49,21 @@ const excludeNonMainTags = [ "#cookie", ]; -const forceIncludeMainTags = ["#main"]; +const forceIncludeMainTags = [ + "#main", + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda" +]; export const htmlTransform = async ( html: string, From 8c1579df5145aaff3faabd9adeb945ca580bf96f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 3 Apr 2025 11:56:24 +0200 Subject: [PATCH 035/160] bump cc --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index ddb686a5..f2a1eb77 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -240,7 +240,7 @@ const testSuiteTokens = [ "0a18c9e", // gh ]; -const manual_growth = ["778c62c4-306f-4039-b372-eb20174760c0"]; +const manual_growth = ["778c62c4-306f-4039-b372-eb20174760c0", "22a07b64-cbfe-4924-9273-e3f01709cdf2"]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e"]; From 426151c9c97394b3e752a75b85adddb3f8dda0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Thu, 3 Apr 2025 17:02:51 +0100 Subject: [PATCH 036/160] feat(queue-jobs): add function to determine job type and update notification logic for concurrency limits --- apps/api/src/services/queue-jobs.ts | 37 +++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 535f51b5..1b4530b9 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -14,6 +14,17 @@ import { getConcurrencyLimitMax } from "./rate-limiter"; import { sendNotificationWithCustomDays } from './notification/email_notification'; import { shouldSendConcurrencyLimitNotification } from './notification/notification-check'; +/** + * Checks if a job is a crawl or batch scrape based on its options + * @param options The job options containing crawlerOptions and crawl_id + * @returns true if the job is either a crawl or batch scrape + */ +function isCrawlOrBatchScrape(options: { crawlerOptions?: any; crawl_id?: string }): boolean { + // If crawlerOptions exists, it's a crawl + // If crawl_id exists but no crawlerOptions, it's a batch scrape + return !!options.crawlerOptions || !!options.crawl_id; +} + async function _addScrapeJobToConcurrencyQueue( webScraperOptions: any, options: any, @@ -83,11 +94,14 @@ async function addScrapeJobRaw( if(concurrencyQueueJobs > maxConcurrency) { logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id); - const shouldSendNotification = await shouldSendConcurrencyLimitNotification(webScraperOptions.team_id); - if (shouldSendNotification) { - sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { - logger.error("Error sending notification (concurrency limit reached): ", error); - }); + // Only send notification if it's not a crawl or batch scrape + if (!isCrawlOrBatchScrape(webScraperOptions)) { + const shouldSendNotification = await shouldSendConcurrencyLimitNotification(webScraperOptions.team_id); + if (shouldSendNotification) { + sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); + } } } @@ -178,11 +192,14 @@ export async function addScrapeJobs( if(addToCQ.length > maxConcurrency) { logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id); - const shouldSendNotification = await shouldSendConcurrencyLimitNotification(jobs[0].data.team_id); - if (shouldSendNotification) { - sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { - logger.error("Error sending notification (concurrency limit reached): ", error); - }); + // Only send notification if it's not a crawl or batch scrape + if (!isCrawlOrBatchScrape(jobs[0].data)) { + const shouldSendNotification = await shouldSendConcurrencyLimitNotification(jobs[0].data.team_id); + if (shouldSendNotification) { + sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { + logger.error("Error sending notification (concurrency limit reached): ", error); + }); + } } } From 7128f83a7a2546c9cde1dcbcaf22b8bdd5825dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 4 Apr 2025 17:54:37 +0200 Subject: [PATCH 037/160] fix(js-sdk): isows import issues (FIR-1586) (FIR-1536) (#1411) * attempt * improvements * kill isows -- there's been native websocket support in node since 21 * clean up the diff --- apps/js-sdk/firecrawl/package-lock.json | 61 ++++++------------------- apps/js-sdk/firecrawl/package.json | 14 +++--- apps/js-sdk/firecrawl/src/index.ts | 1 - apps/js-sdk/firecrawl/tsup.config.ts | 9 ++++ 4 files changed, 32 insertions(+), 53 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index ff78d90e..593a039e 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,16 +1,15 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.1", + "version": "1.22.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.9.1", + "version": "1.22.0", "license": "MIT", "dependencies": { "axios": "^1.6.8", - "isows": "^1.0.4", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -29,6 +28,9 @@ "tsup": "^8.2.4", "typescript": "^5.4.5", "uuid": "^9.0.1" + }, + "engines": { + "node": ">=22.0.0" } }, "node_modules/@ampproject/remapping": { @@ -1826,12 +1828,13 @@ "dev": true }, "node_modules/@types/node": { - "version": "20.12.12", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.12.tgz", - "integrity": "sha512-eWLDGF/FOSPtAvEqeRAQ4C8LSA7M1I7i0ky1I8U7kD1J5ITyW3AsRhQrKVoWf5pFKZ2kILsEGJhsI9r93PYnOw==", + "version": "20.17.30", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.30.tgz", + "integrity": "sha512-7zf4YyHA+jvBNfVrk2Gtvs6x7E8V+YDW05bNfG2XkWDJfYRXrTiP/DsB2zSYTaHX0bGIujTBQdMVAhb+j7mwpg==", "dev": true, + "license": "MIT", "dependencies": { - "undici-types": "~5.26.4" + "undici-types": "~6.19.2" } }, "node_modules/@types/stack-utils": { @@ -3157,20 +3160,6 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "dev": true }, - "node_modules/isows": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz", - "integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/wagmi-dev" - } - ], - "peerDependencies": { - "ws": "*" - } - }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -5252,10 +5241,11 @@ "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==" }, "node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "dev": true + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz", + "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==", + "dev": true, + "license": "MIT" }, "node_modules/update-browserslist-db": { "version": "1.1.0", @@ -5409,27 +5399,6 @@ "node": "^12.13.0 || ^14.15.0 || >=16.0.0" } }, - "node_modules/ws": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", - "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", - "peer": true, - "engines": { - "node": ">=10.0.0" - }, - "peerDependencies": { - "bufferutil": "^4.0.1", - "utf-8-validate": ">=5.0.2" - }, - "peerDependenciesMeta": { - "bufferutil": { - "optional": true - }, - "utf-8-validate": { - "optional": true - } - } - }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 772f2a3e..8a17ed18 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.21.1", + "version": "1.22.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -26,7 +26,6 @@ "license": "MIT", "dependencies": { "axios": "^1.6.8", - "isows": "^1.0.4", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -36,8 +35,6 @@ }, "homepage": "https://github.com/mendableai/firecrawl#readme", "devDependencies": { - "uuid": "^9.0.1", - "dotenv": "^16.4.5", "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", @@ -45,10 +42,12 @@ "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", + "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", "tsup": "^8.2.4", - "typescript": "^5.4.5" + "typescript": "^5.4.5", + "uuid": "^9.0.1" }, "keywords": [ "firecrawl", @@ -58,5 +57,8 @@ "scraper", "api", "sdk" - ] + ], + "engines": { + "node": ">=22.0.0" + } } diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index bec5288f..4942ec69 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,7 +1,6 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; -import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; /** diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts index b3b7e42d..31d67739 100644 --- a/apps/js-sdk/firecrawl/tsup.config.ts +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -6,4 +6,13 @@ export default defineConfig({ dts: true, outDir: "dist", clean: true, + platform: "node", + target: "node22", + noExternal: ["typescript-event-target"], + esbuildOptions(options) { + options.define = { + ...options.define, + "process.env.NODE_ENV": JSON.stringify(process.env.NODE_ENV || "production"), + }; + }, }); \ No newline at end of file From e1e39f8836a48d6d1d1a921028850f452b1d58ca Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 4 Apr 2025 14:34:48 -0400 Subject: [PATCH 038/160] Nick: send notifications for crawl+batch scrape --- apps/api/src/services/notification/email_notification.ts | 2 +- apps/api/src/services/queue-jobs.ts | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 58a38cc0..d17896e8 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -44,7 +44,7 @@ const emailTemplates: Record<

We've improved our system by transitioning to concurrency limits, allowing faster scraping by default and eliminating* the often rate limit errors.

You're hitting the concurrency limit for your plan quite often, which means Firecrawl can't scrape as fast as it could. But don't worry, it is not failing your requests and you are still getting your results.

-

This is just to let you know that you could be scraping more pages faster. Consider upgrading your plan at firecrawl.dev/pricing.


Thanks,
Firecrawl Team
`, +

This is just to let you know that you could be scraping faster. Consider upgrading your plan at firecrawl.dev/pricing.


Thanks,
Firecrawl Team
`, }, }; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 1b4530b9..0a1a49fb 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -95,14 +95,12 @@ async function addScrapeJobRaw( logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id); // Only send notification if it's not a crawl or batch scrape - if (!isCrawlOrBatchScrape(webScraperOptions)) { const shouldSendNotification = await shouldSendConcurrencyLimitNotification(webScraperOptions.team_id); if (shouldSendNotification) { sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { logger.error("Error sending notification (concurrency limit reached): ", error); }); } - } } webScraperOptions.concurrencyLimited = true; @@ -193,14 +191,12 @@ export async function addScrapeJobs( logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id); // Only send notification if it's not a crawl or batch scrape - if (!isCrawlOrBatchScrape(jobs[0].data)) { const shouldSendNotification = await shouldSendConcurrencyLimitNotification(jobs[0].data.team_id); if (shouldSendNotification) { sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => { logger.error("Error sending notification (concurrency limit reached): ", error); }); } - } } await Promise.all( From 41e094032f8ffb1b9bca40b46a9f3ccde6caa324 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 4 Apr 2025 14:36:41 -0400 Subject: [PATCH 039/160] Update email_notification.ts --- apps/api/src/services/notification/email_notification.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index d17896e8..660c83d8 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -273,7 +273,7 @@ export async function sendNotificationWithCustomDays( }, ]); - if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { + if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0 && notificationType !== NotificationType.CONCURRENCY_LIMIT_REACHED) { sendSlackWebhook( `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}.`, false, From 6bed5eca50f621b1c0d8cf9e9e1c791983a6f284 Mon Sep 17 00:00:00 2001 From: Jingyu <56581242+washanhanzi@users.noreply.github.com> Date: Sat, 5 Apr 2025 04:05:51 +0800 Subject: [PATCH 040/160] fix(rust-sdk): remove rustfmt (#1392) rustfmt is deprecated, it depends on a outdated extprim crate which cause test failed to run --- apps/rust-sdk/Cargo.lock | 461 ++------------------------------------- apps/rust-sdk/Cargo.toml | 1 - 2 files changed, 21 insertions(+), 441 deletions(-) diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index bf128210..2ea5de69 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -17,15 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aho-corasick" -version = "0.6.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" -dependencies = [ - "memchr", -] - [[package]] name = "android-tzdata" version = "0.1.1" @@ -65,15 +56,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" -[[package]] -name = "autocfg" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" -dependencies = [ - "autocfg 1.3.0", -] - [[package]] name = "autocfg" version = "1.3.0" @@ -107,12 +89,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "bitflags" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4efd02e230a02e18f92fc2735f44597385ed02ad8f831e7c1c1156ee5e1ab3a5" - [[package]] name = "bitflags" version = "1.3.2" @@ -185,16 +161,7 @@ version = "0.0.302" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d911ee15579a3f50880d8c1d59ef6e79f9533127a3bd342462f5d584f5e8c294" dependencies = [ - "term 0.5.2", -] - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags 1.3.2", + "term", ] [[package]] @@ -270,12 +237,6 @@ dependencies = [ "serde", ] -[[package]] -name = "diff" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" - [[package]] name = "dirs" version = "1.0.5" @@ -284,7 +245,7 @@ checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" dependencies = [ "libc", "redox_users", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -302,16 +263,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "env_logger" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b" -dependencies = [ - "log 0.3.9", - "regex", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -328,19 +279,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "extprim" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b1a357c911c352439b460d7b375b5c85977b9db395b703dfee5a94dfb4d66a2" -dependencies = [ - "num-traits", - "rand", - "rustc_version", - "semver", - "serde", -] - [[package]] name = "fastrand" version = "2.1.0" @@ -354,9 +292,8 @@ dependencies = [ "assert_matches", "clippy", "dotenvy", - "log 0.4.22", + "log", "reqwest", - "rustfmt", "serde", "serde_json", "serde_with", @@ -395,12 +332,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - [[package]] name = "futures-channel" version = "0.3.30" @@ -451,15 +382,6 @@ dependencies = [ "slab", ] -[[package]] -name = "getopts" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.1.16" @@ -689,7 +611,7 @@ version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ - "autocfg 1.3.0", + "autocfg", "hashbrown 0.12.3", "serde", ] @@ -726,22 +648,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "libc" version = "0.2.155" @@ -760,19 +666,10 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ - "autocfg 1.3.0", + "autocfg", "scopeguard", ] -[[package]] -name = "log" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" -dependencies = [ - "log 0.4.22", -] - [[package]] name = "log" version = "0.4.22" @@ -802,13 +699,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ + "hermit-abi", "libc", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -818,7 +716,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ "libc", - "log 0.4.22", + "log", "openssl", "openssl-probe", "openssl-sys", @@ -840,17 +738,7 @@ version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ - "autocfg 1.3.0", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", + "autocfg", ] [[package]] @@ -1003,121 +891,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.8", - "libc", - "rand_chacha", - "rand_core 0.4.2", - "rand_hc", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg", - "rand_xorshift", - "winapi 0.3.9", -] - -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.3.1", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi 0.3.9", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "redox_syscall" version = "0.1.57" @@ -1144,28 +917,6 @@ dependencies = [ "rust-argon2", ] -[[package]] -name = "regex" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9329abc99e39129fcceabd24cf5d85b4671ef7c29c50e972bc5afe32438ec384" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", - "thread_local", - "utf8-ranges", -] - -[[package]] -name = "regex-syntax" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d707a4fa2637f2dca2ef9fd02225ec7661fe01a53623c1e6515b6916511f7a7" -dependencies = [ - "ucd-util", -] - [[package]] name = "reqwest" version = "0.12.5" @@ -1188,7 +939,7 @@ dependencies = [ "hyper-util", "ipnet", "js-sys", - "log 0.4.22", + "log", "mime", "native-tls", "once_cell", @@ -1243,40 +994,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver", -] - -[[package]] -name = "rustfmt" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec940eed814db0fb7ab928c5f5025f97dc55d1c0e345e39dda2ce9f945557500" -dependencies = [ - "diff", - "env_logger", - "getopts", - "kernel32-sys", - "libc", - "log 0.3.9", - "regex", - "serde", - "serde_derive", - "serde_json", - "strings", - "syntex_errors", - "syntex_syntax", - "term 0.4.6", - "toml", - "unicode-segmentation", - "winapi 0.2.8", -] - [[package]] name = "rustix" version = "0.38.34" @@ -1374,21 +1091,6 @@ dependencies = [ "libc", ] -[[package]] -name = "semver" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "serde" version = "1.0.204" @@ -1477,7 +1179,7 @@ version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "autocfg 1.3.0", + "autocfg", ] [[package]] @@ -1502,15 +1204,6 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -[[package]] -name = "strings" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa481ee1bc42fc3df8195f91f7cb43cf8f2b71b48bac40bf5381cfaf7e481f3c" -dependencies = [ - "log 0.3.9", -] - [[package]] name = "strsim" version = "0.11.1" @@ -1540,47 +1233,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" -[[package]] -name = "syntex_errors" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3133289179676c9f5c5b2845bf5a2e127769f4889fcbada43035ef6bd662605e" -dependencies = [ - "libc", - "serde", - "serde_derive", - "syntex_pos", - "term 0.4.6", - "unicode-xid", -] - -[[package]] -name = "syntex_pos" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ab669fa003d208c681f874bbc76d91cc3d32550d16b5d9d2087cf477316470" -dependencies = [ - "serde", - "serde_derive", -] - -[[package]] -name = "syntex_syntax" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03815b9f04d95828770d9c974aa39c6e1f6ef3114eb77a3ce09008a0d15dd142" -dependencies = [ - "bitflags 0.9.1", - "extprim", - "log 0.3.9", - "serde", - "serde_derive", - "serde_json", - "syntex_errors", - "syntex_pos", - "unicode-xid", -] - [[package]] name = "system-configuration" version = "0.5.1" @@ -1614,16 +1266,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "term" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa63644f74ce96fbeb9b794f66aff2a52d601cbd5e80f4b97123e3899f4570f1" -dependencies = [ - "kernel32-sys", - "winapi 0.2.8", -] - [[package]] name = "term" version = "0.5.2" @@ -1632,7 +1274,7 @@ checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" dependencies = [ "byteorder", "dirs", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -1655,15 +1297,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" -dependencies = [ - "lazy_static", -] - [[package]] name = "time" version = "0.3.36" @@ -1712,28 +1345,27 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", @@ -1774,15 +1406,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" -dependencies = [ - "serde", -] - [[package]] name = "tower" version = "0.4.13" @@ -1835,12 +1458,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "ucd-util" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abd2fc5d32b590614af8b0a20d837f32eca055edd0bbead59a9cfe80858be003" - [[package]] name = "unicode-bidi" version = "0.3.15" @@ -1862,24 +1479,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-segmentation" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" - -[[package]] -name = "unicode-width" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" - -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" - [[package]] name = "untrusted" version = "0.9.0" @@ -1897,12 +1496,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "utf8-ranges" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" - [[package]] name = "uuid" version = "1.10.0" @@ -1956,7 +1549,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", - "log 0.4.22", + "log", "once_cell", "proc-macro2", "quote", @@ -2015,12 +1608,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - [[package]] name = "winapi" version = "0.3.9" @@ -2031,12 +1618,6 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 4f34203b..6ea8d179 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -26,7 +26,6 @@ tokio = { version = "^1", features = ["full"] } [dev-dependencies] clippy = "^0.0.302" -rustfmt = "^0.10" assert_matches = "^1.5" dotenvy = "^0.15" tokio = { version = "1", features = ["full"] } From 570809aa59eac5c8bb820842637c5ebf80c44259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 4 Apr 2025 22:12:59 +0200 Subject: [PATCH 041/160] fix(unvisitedUrls): filter with crawler Fixes #1410 --- apps/api/src/services/queue-worker.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a7da10f7..36f8ed2d 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -118,7 +118,19 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const lastUrlsSet = new Set(lastUrls); - const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)); + const crawler = crawlToCrawler( + job.data.crawl_id, + sc, + sc.originUrl!, + job.data.crawlerOptions, + ); + + const univistedUrls = crawler.filterLinks( + Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)), + Infinity, + sc.crawlerOptions.maxDepth ?? 10, + ); + const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount); From f5e5bdb710ef6305664add2af77a840d80de24ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 5 Apr 2025 15:48:47 +0200 Subject: [PATCH 042/160] fix(llmExtract): arbitrary objects caused error to be thrown --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index e3c76362..74b3ff9d 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -74,9 +74,9 @@ function normalizeSchema(x: any): any { return { ...x, properties: Object.fromEntries( - Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]), + Object.entries(x.properties || {}).map(([k, v]) => [k, normalizeSchema(v)]), ), - required: Object.keys(x.properties), + required: Object.keys(x.properties || {}), additionalProperties: false, }; } else if (x && x.type === "array") { From f45fa1207507a110d94e558a361ece8db48d5d7f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 5 Apr 2025 12:42:24 -0400 Subject: [PATCH 043/160] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index f2a1eb77..522f4ed3 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -240,9 +240,9 @@ const testSuiteTokens = [ "0a18c9e", // gh ]; -const manual_growth = ["778c62c4-306f-4039-b372-eb20174760c0", "22a07b64-cbfe-4924-9273-e3f01709cdf2"]; +const manual_growth = ["22a07b64-cbfe-4924-9273-e3f01709cdf2"]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; -const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e"]; +const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e", "778c62c4-306f-4039-b372-eb20174760c0"]; function makePlanKey(plan?: string) { return plan ? plan.replace("-", "") : "default"; // "default" From 17ea3ff3553512b41334cd7d1a74a9d380f1e2ad Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Mon, 7 Apr 2025 18:35:23 +0530 Subject: [PATCH 044/160] Add examples/ Llama 4 Maverick Crawler --- .../llama-4-maverick-web-crawler/.env.example | 5 + .../llama-4-maverick-web-crawler/.gitignore | 48 ++++ .../llama-4-maverick-web-crawler/README.md | 78 ++++++ .../llama4-maverick-web-crawler.py | 239 ++++++++++++++++++ .../requirements.txt | 3 + 5 files changed, 373 insertions(+) create mode 100644 examples/llama-4-maverick-web-crawler/.env.example create mode 100644 examples/llama-4-maverick-web-crawler/.gitignore create mode 100644 examples/llama-4-maverick-web-crawler/README.md create mode 100644 examples/llama-4-maverick-web-crawler/llama4-maverick-web-crawler.py create mode 100644 examples/llama-4-maverick-web-crawler/requirements.txt diff --git a/examples/llama-4-maverick-web-crawler/.env.example b/examples/llama-4-maverick-web-crawler/.env.example new file mode 100644 index 00000000..5636e594 --- /dev/null +++ b/examples/llama-4-maverick-web-crawler/.env.example @@ -0,0 +1,5 @@ +# Firecrawl API Key +FIRECRAWL_API_KEY=your_firecrawl_api_key_here + +# Together AI API Key +TOGETHER_API_KEY=your_together_api_key_here \ No newline at end of file diff --git a/examples/llama-4-maverick-web-crawler/.gitignore b/examples/llama-4-maverick-web-crawler/.gitignore new file mode 100644 index 00000000..7b391148 --- /dev/null +++ b/examples/llama-4-maverick-web-crawler/.gitignore @@ -0,0 +1,48 @@ +# Dependencies +node_modules/ +venv/ +.env +.env.local +.env.*.local + +# Build outputs +dist/ +build/ +*.pyc +__pycache__/ +.cache/ +.pytest_cache/ + +# IDE and editor files +.idea/ +.vscode/ +*.swp +*.swo +.DS_Store +Thumbs.db + +# Logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Coverage and test reports +coverage/ +.coverage +htmlcov/ + +# Temporary files +*.tmp +*.temp +.tmp/ +temp/ + +# System files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db \ No newline at end of file diff --git a/examples/llama-4-maverick-web-crawler/README.md b/examples/llama-4-maverick-web-crawler/README.md new file mode 100644 index 00000000..aa249550 --- /dev/null +++ b/examples/llama-4-maverick-web-crawler/README.md @@ -0,0 +1,78 @@ +# Llama 4 Maverick Web Crawler + +This project combines the power of Firecrawl for web crawling and Llama 4 Maverick (via Together AI) for intelligent content analysis. It helps you find specific information on websites by crawling pages and analyzing their content using advanced language models. + +## Features + +- Intelligent URL mapping and relevance ranking +- Content analysis using Llama 4 Maverick model +- Automatic extraction of relevant information +- Color-coded console output for better readability + +## Prerequisites + +- Python 3.8 or higher +- Firecrawl API key +- Together AI API key + +## Installation + +1. Clone this repository +2. Install the required packages: + ```bash + pip install -r requirements.txt + ``` +3. Copy the `.env.example` file to `.env`: + ```bash + cp .env.example .env + ``` +4. Add your API keys to the `.env` file: + ``` + FIRECRAWL_API_KEY=your_firecrawl_api_key_here + TOGETHER_API_KEY=your_together_api_key_here + ``` + +## Usage + +Run the script using: + +```bash +python llama4-maverick-web-crawler.py +``` + +You will be prompted to: + +1. Enter the website URL to crawl +2. Specify your objective/what information you're looking for + +The script will then: + +1. Map the website and find relevant pages +2. Analyze the content using Llama 4 Maverick +3. Extract and return the requested information in JSON format + +## Example + +```bash +Enter the website to crawl: https://example.com +Enter your objective: Find the company's contact information +``` + +## Error Handling + +The script includes comprehensive error handling and will provide clear feedback if: + +- API keys are missing +- Website is inaccessible +- No relevant information is found +- Any other errors occur during execution + +## Dependencies + +- firecrawl: For web crawling and content extraction +- together: For accessing the Llama 4 Maverick model +- python-dotenv: For environment variable management + +## License + +[Your chosen license] diff --git a/examples/llama-4-maverick-web-crawler/llama4-maverick-web-crawler.py b/examples/llama-4-maverick-web-crawler/llama4-maverick-web-crawler.py new file mode 100644 index 00000000..eaccfbf3 --- /dev/null +++ b/examples/llama-4-maverick-web-crawler/llama4-maverick-web-crawler.py @@ -0,0 +1,239 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from together import Together + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +together_api_key = os.getenv("TOGETHER_API_KEY") + +# Initialize the FirecrawlApp and Together client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = Together(api_key=together_api_key) + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + completion = client.chat.completions.create( + model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + messages=[ + { + "role": "user", + "content": map_prompt + } + ] + ) + + map_search_parameter = completion.choices[0].message.content + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + + # Debug print to see the response structure + print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}") + + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + # Handle the response based on its structure + if isinstance(map_website, dict): + # Assuming the links are in a 'urls' or similar key + links = map_website.get('urls', []) or map_website.get('links', []) + elif isinstance(map_website, str): + try: + parsed = json.loads(map_website) + links = parsed.get('urls', []) or parsed.get('links', []) + except json.JSONDecodeError: + links = [] + else: + links = map_website if isinstance(map_website, list) else [] + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + rank_prompt = f""" + Given this list of URLs and the objective: {objective} + Analyze each URL and rank the top 3 most relevant ones that are most likely to contain the information we need. + + IMPORTANT: You must ONLY return a JSON array with exactly 3 objects. Do not include ANY explanation text. + Do not include markdown formatting or ```json blocks. Return ONLY the raw JSON array. + + Each object in the array must have exactly these fields: + - "url": the full URL + - "relevance_score": number between 0-100 + - "reason": brief explanation of why this URL is relevant + + URLs to analyze: + {json.dumps(links, indent=2)} + """ + + print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}") + completion = client.chat.completions.create( + model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + messages=[ + { + "role": "user", + "content": rank_prompt + } + ] + ) + + # Debug print to see LLM's raw response + print(f"{Colors.MAGENTA}Debug - LLM raw response:{Colors.RESET}") + print(f"{Colors.MAGENTA}{completion.choices[0].message.content}{Colors.RESET}") + + try: + # Try to clean the response by stripping any potential markdown or extra whitespace + cleaned_response = completion.choices[0].message.content.strip() + if cleaned_response.startswith("```json"): + cleaned_response = cleaned_response.split("```json")[1] + if cleaned_response.endswith("```"): + cleaned_response = cleaned_response.rsplit("```", 1)[0] + cleaned_response = cleaned_response.strip() + + ranked_results = json.loads(cleaned_response) + + # Validate the structure of the results + if not isinstance(ranked_results, list): + raise ValueError("Response is not a list") + + for result in ranked_results: + if not all(key in result for key in ["url", "relevance_score", "reason"]): + raise ValueError("Response items missing required fields") + + links = [result["url"] for result in ranked_results] + + # Print detailed ranking info + print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}") + for result in ranked_results: + print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}") + print(f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}") + print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}") + print("---") + + if not links: + print(f"{Colors.RED}No relevant links identified.{Colors.RESET}") + return None + + except (json.JSONDecodeError, KeyError) as e: + print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}") + return links + + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app, client): + try: + # Get top 3 links from the map result + if not map_website: + print(f"{Colors.RED}No links found to analyze.{Colors.RESET}") + return None + + top_links = map_website[:3] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + for link in top_links: + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + + IMPORTANT: You must ONLY return one of two possible responses: + 1. If objective is NOT met, respond with exactly: Objective not met + 2. If objective IS met, respond with ONLY a JSON object containing the relevant information. + Do not include ANY explanation text, markdown formatting, or ```json blocks. + Return ONLY the raw JSON object. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + """ + + completion = client.chat.completions.create( + model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + messages=[{"role": "user", "content": check_prompt}] + ) + + result = completion.choices[0].message.content.strip() + + # Clean up the response if it contains markdown formatting + if result.startswith("```json"): + result = result.split("```json")[1] + if result.endswith("```"): + result = result.rsplit("```", 1)[0] + result = result.strip() + + if result == "Objective not met": + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + continue + + try: + json_result = json.loads(result) + print(f"{Colors.GREEN}Objective fulfilled. Relevant information found.{Colors.RESET}") + return json_result + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}") + print(f"{Colors.MAGENTA}Raw response: {result}{Colors.RESET}") + continue + + print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input(f"{Colors.BLUE}Enter the website to crawl : {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app, client) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using Llama 4 Maverick...{Colors.RESET}") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information :{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/llama-4-maverick-web-crawler/requirements.txt b/examples/llama-4-maverick-web-crawler/requirements.txt new file mode 100644 index 00000000..d9f05edc --- /dev/null +++ b/examples/llama-4-maverick-web-crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl>=0.1.0 +together>=0.2.0 +python-dotenv>=0.19.0 \ No newline at end of file From 2f037fa1a7f8d84257442610582c572631a09b74 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Mon, 7 Apr 2025 19:00:10 +0530 Subject: [PATCH 045/160] Add examples/llama4-maverick-web-extractor --- .../.env.example | 11 + .../llama-4-maverick-web-extractor/.gitignore | 1 + .../llama-4-maverick-web-extractor/README.md | 84 ++++++ .../llama-4-maverick-extractor.py | 240 ++++++++++++++++++ .../requirements.txt | 4 + 5 files changed, 340 insertions(+) create mode 100644 examples/llama-4-maverick-web-extractor/.env.example create mode 100644 examples/llama-4-maverick-web-extractor/.gitignore create mode 100644 examples/llama-4-maverick-web-extractor/README.md create mode 100644 examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py create mode 100644 examples/llama-4-maverick-web-extractor/requirements.txt diff --git a/examples/llama-4-maverick-web-extractor/.env.example b/examples/llama-4-maverick-web-extractor/.env.example new file mode 100644 index 00000000..55db5bce --- /dev/null +++ b/examples/llama-4-maverick-web-extractor/.env.example @@ -0,0 +1,11 @@ +# Together AI API Key (Required) +# Get it from: https://www.together.ai/ +TOGETHER_API_KEY=your_together_ai_key_here + +# SerpAPI Key (Required) +# Get it from: https://serpapi.com/ +SERP_API_KEY=your_serpapi_key_here + +# Firecrawl API Key (Required) +# Get it from: https://firecrawl.dev/ +FIRECRAWL_API_KEY=your_firecrawl_key_here \ No newline at end of file diff --git a/examples/llama-4-maverick-web-extractor/.gitignore b/examples/llama-4-maverick-web-extractor/.gitignore new file mode 100644 index 00000000..0519ecba --- /dev/null +++ b/examples/llama-4-maverick-web-extractor/.gitignore @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/llama-4-maverick-web-extractor/README.md b/examples/llama-4-maverick-web-extractor/README.md new file mode 100644 index 00000000..c2be6744 --- /dev/null +++ b/examples/llama-4-maverick-web-extractor/README.md @@ -0,0 +1,84 @@ +# Web Information Extractor with Llama 4 Maverick + +This tool uses Llama 4 Maverick (via Together AI), SerpAPI, and Firecrawl to automatically extract structured information about companies from the web. It performs intelligent URL selection and information extraction from web content. + +## Features + +- Automated Google search using SerpAPI +- Intelligent URL selection using Llama 4 Maverick +- Structured data extraction using Firecrawl +- Color-coded console output for better readability + +## Prerequisites + +- Python 3.8+ +- Together AI API key +- SerpAPI API key +- Firecrawl API key + +## Installation + +1. Clone the repository: + +```bash +git clone +cd +``` + +2. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +3. Copy the example environment file and fill in your API keys: + +```bash +cp .env.example .env +``` + +4. Edit the `.env` file with your API keys: + +``` +TOGETHER_API_KEY=your_together_ai_key +SERP_API_KEY=your_serpapi_key +FIRECRAWL_API_KEY=your_firecrawl_key +``` + +## Usage + +Run the script: + +```bash +python llama-4-maverick-extractor.py +``` + +The script will: + +1. Prompt you for a company name +2. Ask what information you want to extract +3. Search for relevant URLs +4. Extract and structure the requested information +5. Display the results + +## Example + +```bash +$ python llama-4-maverick-extractor.py +Enter the company name: Tesla +Enter what information you want about the company: latest electric vehicle models and their prices +``` + +## Error Handling + +The script includes comprehensive error handling for: + +- Missing API keys +- API rate limits +- Network issues +- Invalid responses +- JSON parsing errors + +## License + +MIT License - feel free to use and modify as needed. diff --git a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py new file mode 100644 index 00000000..a2e5fcb9 --- /dev/null +++ b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py @@ -0,0 +1,240 @@ +import os +import json +import time +import requests +from dotenv import load_dotenv +from serpapi.google_search import GoogleSearch +from together import Together + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +together_api_key = os.getenv("TOGETHER_API_KEY") +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +serp_api_key = os.getenv("SERP_API_KEY") + +if not together_api_key: + print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}") +if not firecrawl_api_key: + print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}") + +# Initialize Together AI client +together_client = Together(api_key=together_api_key) + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": serp_api_key}) + results = search.get_dict().get("organic_results", []) + print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}") + return results + +def select_urls_with_llama(company, objective, serp_results): + """ + Use Llama 4 Maverick to select URLs from SERP results. + Returns a list of URLs. + """ + try: + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}") + + prompt = ( + "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n" + "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n" + "Instructions:\n" + "1. Analyze the search results for information about the specified company\n" + "2. Select URLs that are most likely to contain the requested information\n" + "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n" + "4. Do not include social media links\n" + "5. DO NOT include any explanation or analysis in your response\n" + "6. ONLY output the JSON object\n\n" + f"Company: {company}\n" + f"Information Needed: {objective}\n" + f"Search Results: {json.dumps(serp_data, indent=2)}\n\n" + "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT." + ) + + try: + print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}") + response = together_client.chat.completions.create( + model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + messages=[{"role": "user", "content": prompt}], + temperature=0.1 # Lower temperature for more focused responses + ) + cleaned_response = response.choices[0].message.content.strip() + print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}") + + # Clean the response text + if cleaned_response.startswith('```'): + cleaned_response = cleaned_response.split('```')[1] + if cleaned_response.startswith('json'): + cleaned_response = cleaned_response[4:] + cleaned_response = cleaned_response.strip() + + # Try to find JSON object in the response + json_start = cleaned_response.find('{') + json_end = cleaned_response.rfind('}') + 1 + if json_start != -1 and json_end != -1: + cleaned_response = cleaned_response[json_start:json_end] + + try: + # Parse JSON response + result = json.loads(cleaned_response) + if isinstance(result, dict) and "selected_urls" in result: + urls = result["selected_urls"] + else: + print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}") + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + except json.JSONDecodeError: + print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}") + # Fallback to text parsing + urls = [line.strip() for line in cleaned_response.split('\n') + if line.strip().startswith(('http://', 'https://'))] + + # Clean up URLs + cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls] + cleaned_urls = [url for url in cleaned_urls if url] + + if not cleaned_urls: + print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}") + return [] + + print(f"{Colors.CYAN}Selected URLs for extraction:{Colors.RESET}") + for url in cleaned_urls: + print(f"- {url}") + + return cleaned_urls + + except Exception as e: + print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}") + return [] + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}") + return [] + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": prompt + " for " + company, + "enableWebSearch": True + } + + try: + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload, + timeout=30 + ) + + data = response.json() + + if not data.get('success'): + print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}") + return None + + extraction_id = data.get('id') + if not extraction_id: + print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}") + return None + + return poll_firecrawl_result(extraction_id, api_key) + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request failed: {e}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def poll_firecrawl_result(extraction_id, api_key, interval=10, max_attempts=60): + """Poll Firecrawl API to get the extraction result.""" + url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}" + headers = { + 'Authorization': f'Bearer {api_key}' + } + + print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}") + + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + if data.get('success') and data.get('data'): + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + print(json.dumps(data['data'], indent=2)) + return data['data'] + elif data.get('success') and not data.get('data'): + if attempt % 6 == 0: + print(f"{Colors.YELLOW}Still processing... (attempt {attempt}/{max_attempts}){Colors.RESET}") + time.sleep(interval) + else: + print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}") + return None + + except requests.exceptions.RequestException as e: + print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}") + return None + except json.JSONDecodeError as e: + print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}") + return None + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + serp_results = search_google(f"{company}") + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + selected_urls = select_urls_with_llama(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}No URLs were selected.{Colors.RESET}") + return + + data = extract_company_info(selected_urls, objective, company, firecrawl_api_key) + + if data: + print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}") + else: + print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/llama-4-maverick-web-extractor/requirements.txt b/examples/llama-4-maverick-web-extractor/requirements.txt new file mode 100644 index 00000000..33272430 --- /dev/null +++ b/examples/llama-4-maverick-web-extractor/requirements.txt @@ -0,0 +1,4 @@ +together>=0.2.5 +python-dotenv>=1.0.0 +requests>=2.31.0 +google-search-results>=2.4.2 \ No newline at end of file From 80bf732f5088ec0ee94f3885891138ece6b319ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Mon, 7 Apr 2025 15:59:29 +0100 Subject: [PATCH 046/160] feat: incorporate user preferences and notification categories --- .../notification/email_notification.ts | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 660c83d8..cf238b06 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -48,6 +48,17 @@ const emailTemplates: Record< }, }; +// Map notification types to email categories +const notificationToEmailCategory: Record = { + [NotificationType.APPROACHING_LIMIT]: 'system_alerts', + [NotificationType.LIMIT_REACHED]: 'system_alerts', + [NotificationType.RATE_LIMIT_REACHED]: 'rate_limit_warnings', + [NotificationType.AUTO_RECHARGE_SUCCESS]: 'system_alerts', + [NotificationType.AUTO_RECHARGE_FAILED]: 'system_alerts', + [NotificationType.CONCURRENCY_LIMIT_REACHED]: 'rate_limit_warnings', + [NotificationType.AUTO_RECHARGE_FREQUENT]: 'system_alerts', +}; + export async function sendNotification( team_id: string, notificationType: NotificationType, @@ -66,14 +77,57 @@ export async function sendNotification( ); } -export async function sendEmailNotification( +async function sendEmailNotification( email: string, notificationType: NotificationType, ) { const resend = new Resend(process.env.RESEND_API_KEY); try { - const { data, error } = await resend.emails.send({ + // Get user's email preferences + const { data: user, error: userError } = await supabase_service + .from("users") + .select("id") + .eq("email", email) + .single(); + + if (userError) { + logger.debug(`Error fetching user: ${userError}`); + return { success: false }; + } + + // Check user's email preferences + const { data: preferences, error: prefError } = await supabase_service + .from("notification_preferences") + .select("unsubscribed_all, email_preferences") + .eq("user_id", user.id) + .single(); + + if (prefError) { + logger.debug(`Error fetching preferences: ${prefError}`); + return { success: false }; + } + + // If user has unsubscribed from all emails or we can't find their preferences, don't send + if (!preferences || preferences.unsubscribed_all) { + logger.debug(`User ${email} has unsubscribed from all emails or preferences not found`); + return { success: true }; // Return success since this is an expected case + } + + // Get the email category for this notification type + const emailCategory = notificationToEmailCategory[notificationType]; + + // If user has unsubscribed from this category of emails, don't send + if ( + preferences.email_preferences && + Array.isArray(preferences.email_preferences) && + !preferences.email_preferences.includes(emailCategory) + ) { + logger.debug(`User ${email} has unsubscribed from ${emailCategory} emails`); + return { success: true }; // Return success since this is an expected case + } + + const { error } = await resend.emails.send({ from: "Firecrawl ", to: [email], reply_to: "help@firecrawl.com", @@ -85,13 +139,15 @@ export async function sendEmailNotification( logger.debug(`Error sending email: ${error}`); return { success: false }; } + + return { success: true }; } catch (error) { logger.debug(`Error sending email (2): ${error}`); return { success: false }; } } -export async function sendNotificationInternal( +async function sendNotificationInternal( team_id: string, notificationType: NotificationType, startDateString: string | null, From 132127510209183ad224d23380fed0cf2f628bad Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Tue, 8 Apr 2025 20:52:06 +0530 Subject: [PATCH 047/160] Update Llama 4 Maverick extractor implementation --- .../llama-4-maverick-extractor.py | 92 +++++++++++-------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py index a2e5fcb9..eb87f871 100644 --- a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py +++ b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py @@ -21,26 +21,23 @@ load_dotenv() # Initialize clients together_api_key = os.getenv("TOGETHER_API_KEY") +if not together_api_key: + print(f"{Colors.RED}Error: TOGETHER_API_KEY not found in environment variables{Colors.RESET}") + +client = Together(api_key=together_api_key) firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") serp_api_key = os.getenv("SERP_API_KEY") -if not together_api_key: - print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}") if not firecrawl_api_key: print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}") -# Initialize Together AI client -together_client = Together(api_key=together_api_key) - def search_google(query): """Search Google using SerpAPI and return top results.""" print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") search = GoogleSearch({"q": query, "api_key": serp_api_key}) - results = search.get_dict().get("organic_results", []) - print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}") - return results + return search.get_dict().get("organic_results", []) -def select_urls_with_llama(company, objective, serp_results): +def select_urls_with_gemini(company, objective, serp_results): """ Use Llama 4 Maverick to select URLs from SERP results. Returns a list of URLs. @@ -49,33 +46,47 @@ def select_urls_with_llama(company, objective, serp_results): serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} for r in serp_results if r.get("link")] - print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}") + print(f"{Colors.CYAN}Found {len(serp_data)} search results to analyze{Colors.RESET}") + + if not serp_data: + print(f"{Colors.YELLOW}No search results found to analyze{Colors.RESET}") + return [] prompt = ( - "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n" - "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n" + "Task: Select the most relevant URLs from search results, prioritizing official sources.\n\n" "Instructions:\n" - "1. Analyze the search results for information about the specified company\n" - "2. Select URLs that are most likely to contain the requested information\n" - "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n" - "4. Do not include social media links\n" - "5. DO NOT include any explanation or analysis in your response\n" - "6. ONLY output the JSON object\n\n" + "1. PRIORITIZE official company websites, documentation, and press releases first\n" + "2. Select ONLY URLs that directly contain information about the requested topic\n" + "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n" + "4. Do not include social media links (Twitter, LinkedIn, Facebook, etc.)\n" + "5. Exclude any LinkedIn URLs as they cannot be accessed\n" + "6. Select a MAXIMUM of 3 most relevant URLs\n" + "7. Order URLs by relevance: official sources first, then trusted news/industry sources\n" + "8. IMPORTANT: Only output the JSON object, no other text or explanation\n\n" f"Company: {company}\n" f"Information Needed: {objective}\n" f"Search Results: {json.dumps(serp_data, indent=2)}\n\n" - "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT." + "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}\n\n" + "Remember: Prioritize OFFICIAL sources and limit to 3 MOST RELEVANT URLs only." ) - + try: - print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}") - response = together_client.chat.completions.create( + print(f"{Colors.YELLOW}Calling Together AI model...{Colors.RESET}") + response = client.chat.completions.create( model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", messages=[{"role": "user", "content": prompt}], - temperature=0.1 # Lower temperature for more focused responses ) + print(f"{Colors.GREEN}Got response from Together AI{Colors.RESET}") + print(f"{Colors.CYAN}Raw response: {response.choices[0].message.content}{Colors.RESET}") + cleaned_response = response.choices[0].message.content.strip() - print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}") + + # Find the JSON object in the response + import re + json_match = re.search(r'\{[\s\S]*"selected_urls"[\s\S]*\}', cleaned_response) + if json_match: + cleaned_response = json_match.group(0) + print(f"{Colors.CYAN}Extracted JSON: {cleaned_response}{Colors.RESET}") # Clean the response text if cleaned_response.startswith('```'): @@ -84,24 +95,16 @@ def select_urls_with_llama(company, objective, serp_results): cleaned_response = cleaned_response[4:] cleaned_response = cleaned_response.strip() - # Try to find JSON object in the response - json_start = cleaned_response.find('{') - json_end = cleaned_response.rfind('}') + 1 - if json_start != -1 and json_end != -1: - cleaned_response = cleaned_response[json_start:json_end] - try: # Parse JSON response result = json.loads(cleaned_response) if isinstance(result, dict) and "selected_urls" in result: urls = result["selected_urls"] else: - print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}") - # Fallback to text parsing - urls = [line.strip() for line in cleaned_response.split('\n') - if line.strip().startswith(('http://', 'https://'))] - except json.JSONDecodeError: - print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}") + print(f"{Colors.YELLOW}Response did not contain the expected 'selected_urls' key{Colors.RESET}") + urls = [] + except json.JSONDecodeError as e: + print(f"{Colors.YELLOW}Failed to parse JSON: {str(e)}{Colors.RESET}") # Fallback to text parsing urls = [line.strip() for line in cleaned_response.split('\n') if line.strip().startswith(('http://', 'https://'))] @@ -121,7 +124,7 @@ def select_urls_with_llama(company, objective, serp_results): return cleaned_urls except Exception as e: - print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}") + print(f"{Colors.RED}Error calling Together AI: {str(e)}{Colors.RESET}") return [] except Exception as e: @@ -144,13 +147,18 @@ def extract_company_info(urls, prompt, company, api_key): } try: + print(f"{Colors.CYAN}Making request to Firecrawl API...{Colors.RESET}") response = requests.post( "https://api.firecrawl.dev/v1/extract", headers=headers, json=payload, - timeout=30 + timeout=120 # Increased timeout to 120 seconds ) + if response.status_code != 200: + print(f"{Colors.RED}API returned status code {response.status_code}: {response.text}{Colors.RESET}") + return None + data = response.json() if not data.get('success'): @@ -162,8 +170,12 @@ def extract_company_info(urls, prompt, company, api_key): print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}") return None - return poll_firecrawl_result(extraction_id, api_key) + return poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120) # Increased polling attempts + except requests.exceptions.Timeout: + print(f"{Colors.RED}Request timed out. The operation might still be processing in the background.{Colors.RESET}") + print(f"{Colors.YELLOW}You may want to try again with fewer URLs or a more specific prompt.{Colors.RESET}") + return None except requests.exceptions.RequestException as e: print(f"{Colors.RED}Request failed: {e}{Colors.RESET}") return None @@ -223,7 +235,7 @@ def main(): print(f"{Colors.RED}No search results found.{Colors.RESET}") return - selected_urls = select_urls_with_llama(company, objective, serp_results) + selected_urls = select_urls_with_gemini(company, objective, serp_results) if not selected_urls: print(f"{Colors.RED}No URLs were selected.{Colors.RESET}") From bd1c1b0012e8cd4305b9ccffbe0f37c6a5a838ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 8 Apr 2025 19:28:21 +0200 Subject: [PATCH 048/160] feat(log_job): start saving jobs to GCS (#1424) --- apps/api/package.json | 3 +- apps/api/pnpm-lock.yaml | 466 +++++++++++++++++++++-- apps/api/src/services/logging/log_job.ts | 43 ++- 3 files changed, 471 insertions(+), 41 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 3feec21f..35791483 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -48,7 +48,7 @@ "supertest": "^6.3.3", "ts-jest": "^29.1.1", "ts-node": "^10.9.1", - "typescript": "^5.4.2" + "typescript": "^5.8.3" }, "dependencies": { "@ai-sdk/openai": "^1.1.13", @@ -59,6 +59,7 @@ "@bull-board/express": "^5.20.5", "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", + "@google-cloud/storage": "^7.16.0", "@nangohq/node": "^0.40.8", "@pinecone-database/pinecone": "^4.0.0", "@sentry/cli": "^2.33.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index fdd8aa14..508fb7cd 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -32,6 +32,9 @@ importers: '@dqbd/tiktoken': specifier: ^1.0.17 version: 1.0.17 + '@google-cloud/storage': + specifier: ^7.16.0 + version: 7.16.0(encoding@0.1.13) '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 @@ -196,7 +199,7 @@ importers: version: 0.0.10 puppeteer: specifier: ^22.12.1 - version: 22.12.1(typescript@5.4.5) + version: 22.12.1(typescript@5.8.3) rate-limiter-flexible: specifier: 2.4.2 version: 2.4.2 @@ -296,7 +299,7 @@ importers: version: 6.0.2 jest: specifier: ^29.6.3 - version: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + version: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) jest-fetch-mock: specifier: ^3.0.3 version: 3.0.3(encoding@0.1.13) @@ -311,13 +314,13 @@ importers: version: 6.3.4 ts-jest: specifier: ^29.1.1 - version: 29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)))(typescript@5.4.5) + version: 29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)))(typescript@5.8.3) ts-node: specifier: ^10.9.1 - version: 10.9.2(@types/node@20.14.1)(typescript@5.4.5) + version: 10.9.2(@types/node@20.14.1)(typescript@5.8.3) typescript: - specifier: ^5.4.2 - version: 5.4.5 + specifier: ^5.8.3 + version: 5.8.3 packages: @@ -767,6 +770,22 @@ packages: '@dqbd/tiktoken@1.0.17': resolution: {integrity: sha512-v2gz0V6DiuR2TsALM32TkBThf6LdjLbxe6HS/nx9/KJxuDX0Z7SGX7N7PvQfqIvRyus42jI9poVUqezc/j/aQw==} + '@google-cloud/paginator@5.0.2': + resolution: {integrity: sha512-DJS3s0OVH4zFDB1PzjxAsHqJT6sKVbRwwML0ZBP9PbU7Yebtu/7SWMRzvO2J3nUi9pRNITCfu4LJeooM2w4pjg==} + engines: {node: '>=14.0.0'} + + '@google-cloud/projectify@4.0.0': + resolution: {integrity: sha512-MmaX6HeSvyPbWGwFq7mXdo0uQZLGBYCwziiLIGq5JVX+/bdI3SAq6bP98trV5eTWfLuvsMcIC1YJOF2vfteLFA==} + engines: {node: '>=14.0.0'} + + '@google-cloud/promisify@4.0.0': + resolution: {integrity: sha512-Orxzlfb9c67A15cq2JQEyVc7wEsmFBmHjZWZYQMUyJ1qivXyMwdyNOs9odi79hze+2zqdTtu1E19IM/FtqZ10g==} + engines: {node: '>=14'} + + '@google-cloud/storage@7.16.0': + resolution: {integrity: sha512-7/5LRgykyOfQENcm6hDKP8SX/u9XxE5YOiWOkgkwcoO+cG8xT/cyOvp9wwN3IxfdYgpHs8CE7Nq2PKX2lNaEXw==} + engines: {node: '>=14'} + '@ioredis/commands@1.2.0': resolution: {integrity: sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==} @@ -1553,6 +1572,10 @@ packages: '@supabase/supabase-js@2.44.2': resolution: {integrity: sha512-fouCwL1OxqftOwLNgdDUPlNnFuCnt30nS4kLcnTpe6NYKn1PmjxRRBFmKscgHs6FjWyU+32ZG4uBJ29+/BWiDw==} + '@tootallnate/once@2.0.0': + resolution: {integrity: sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==} + engines: {node: '>= 10'} + '@tootallnate/quickjs-emscripten@0.23.0': resolution: {integrity: sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==} @@ -1586,6 +1609,9 @@ packages: '@types/body-parser@1.19.5': resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==} + '@types/caseless@0.12.5': + resolution: {integrity: sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==} + '@types/connect@3.4.36': resolution: {integrity: sha512-P63Zd/JUGq+PdrM1lv0Wv5SBYeA2+CORvbrXbngriYY0jzLUWfQMQQxOhjONEz/wlHOAxOdY7CY65rgQdTjq2w==} @@ -1673,6 +1699,9 @@ packages: '@types/range-parser@1.2.7': resolution: {integrity: sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==} + '@types/request@2.48.12': + resolution: {integrity: sha512-G3sY+NpsA9jnwm0ixhAFQSJ3Q9JkpLZpJbI3GMv0mIAT0y3mRabYeINzal5WOChIiaTEGQYlHOKgkaM9EisWHw==} + '@types/send@0.17.4': resolution: {integrity: sha512-x2EM6TJOybec7c52BX0ZspPodMsQUd5L6PRwOunVyVUhXiBSKf3AezDL8Dgvgt5o0UfKNfuA0eMLr2wLT4AiBA==} @@ -1691,6 +1720,9 @@ packages: '@types/supertest@6.0.2': resolution: {integrity: sha512-137ypx2lk/wTQbW6An6safu9hXmajAifU/s7szAHLN/FeIm5w7yR0Wkl9fdJMRSHwOn4HLAI0DaB2TOORuhPDg==} + '@types/tough-cookie@4.0.5': + resolution: {integrity: sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==} + '@types/triple-beam@1.3.5': resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==} @@ -1843,6 +1875,10 @@ packages: array-keyed-map@2.1.3: resolution: {integrity: sha512-JIUwuFakO+jHjxyp4YgSiKXSZeC0U+R1jR94bXWBcVlFRBycqXlb+kH9JHxBGcxnVuSqx5bnn0Qz9xtSeKOjiA==} + arrify@2.0.1: + resolution: {integrity: sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==} + engines: {node: '>=8'} + asap@2.0.6: resolution: {integrity: sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==} @@ -1853,6 +1889,9 @@ packages: async-mutex@0.5.0: resolution: {integrity: sha512-1A94B18jkJ3DYq284ohPxoXbfTA5HsQ7/Mf4DEhcyLx3Bz27Rh59iScbB6EPiP+B+joue6YCxcMXSbFC1tZKwA==} + async-retry@1.3.3: + resolution: {integrity: sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==} + async@2.6.4: resolution: {integrity: sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==} @@ -1933,6 +1972,9 @@ packages: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} engines: {node: '>=10.0.0'} + bignumber.js@9.2.0: + resolution: {integrity: sha512-JocpCSOixzy5XFJi2ub6IMmV/G9i8Lrm2lZvwBv9xPdglmZM0ufDVBbjbrfU/zuLvBfD7Bv2eYxz9i+OHTgkew==} + bin-links@4.0.4: resolution: {integrity: sha512-cMtq4W5ZsEwcutJrVId+a/tjt8GSbS+h0oNkdl6+6rBuEv8Ot33Bevj5KPm40t309zuhVic8NjpuL42QCiJWWA==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1986,6 +2028,9 @@ packages: buffer-crc32@0.2.13: resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==} + buffer-equal-constant-time@1.0.1: + resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} + buffer-from@1.1.2: resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} @@ -2006,6 +2051,10 @@ packages: resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==} engines: {node: '>=10.6.0'} + call-bind-apply-helpers@1.0.2: + resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==} + engines: {node: '>= 0.4'} + call-bind@1.0.7: resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==} engines: {node: '>= 0.4'} @@ -2367,9 +2416,19 @@ packages: duck@0.1.12: resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==} + dunder-proto@1.0.1: + resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} + engines: {node: '>= 0.4'} + + duplexify@4.1.3: + resolution: {integrity: sha512-M3BmBhwJRZsSx38lZyhE53Csddgzl5R7xGJNk7CVddZD6CcmwMCH8J+7AprIrQKH7TonKxaCjcv27Qmf+sQ+oA==} + eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} + ecdsa-sig-formatter@1.0.11: + resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==} + editorconfig@1.0.4: resolution: {integrity: sha512-L9Qe08KWTlqYMVvMcTIvMAdl1cDUubzRNYL+WfA4bLDMHe4nemKkpmYzkznE1FwLKu0EEmy6obgQKzMJrg4x9Q==} engines: {node: '>=14'} @@ -2424,10 +2483,22 @@ packages: resolution: {integrity: sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==} engines: {node: '>= 0.4'} + es-define-property@1.0.1: + resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==} + engines: {node: '>= 0.4'} + es-errors@1.3.0: resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==} engines: {node: '>= 0.4'} + es-object-atoms@1.1.1: + resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==} + engines: {node: '>= 0.4'} + + es-set-tostringtag@2.1.0: + resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==} + engines: {node: '>= 0.4'} + escalade@3.1.2: resolution: {integrity: sha512-ErCHMCae19vR8vQGe50xIsVomy19rg6gFu3+r3jkEO46suLMWBksvVyoGgQV+jOfl84ZSOSlmv6Gxa89PmTGmA==} engines: {node: '>=6'} @@ -2508,6 +2579,9 @@ packages: resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==} engines: {node: '>= 0.10.0'} + extend@3.0.2: + resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} + extract-zip@2.0.1: resolution: {integrity: sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==} engines: {node: '>= 10.17.0'} @@ -2587,6 +2661,10 @@ packages: resolution: {integrity: sha512-KQVhvhK8ZkWzxKxOr56CPulAhH3dobtuQ4+hNQ+HekH/Wp5gSOafqRAeTphQUJAIk0GBvHZgJ2ZGRWd5kphMuw==} engines: {node: '>= 18'} + form-data@2.5.3: + resolution: {integrity: sha512-XHIrMD0NpDrNM/Ckf7XJiBbLl57KEhT3+i3yY+eWm+cqYZJQTZrKo8Y8AWKnuV5GT4scfuUGt9LzNoIx3dU1nQ==} + engines: {node: '>= 0.12'} + form-data@4.0.0: resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==} engines: {node: '>= 6'} @@ -2633,6 +2711,14 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + gaxios@6.7.1: + resolution: {integrity: sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==} + engines: {node: '>=14'} + + gcp-metadata@6.1.1: + resolution: {integrity: sha512-a4tiq7E0/5fTjxPAaH4jpjkSv/uCaU2p5KC6HVGrvl0cDjA8iBZv4vv1gyzlmK0ZUKqwpOyQMKzZQe3lTit77A==} + engines: {node: '>=14'} + generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2649,10 +2735,18 @@ packages: resolution: {integrity: sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==} engines: {node: '>= 0.4'} + get-intrinsic@1.3.0: + resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==} + engines: {node: '>= 0.4'} + get-package-type@0.1.0: resolution: {integrity: sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==} engines: {node: '>=8.0.0'} + get-proto@1.0.1: + resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==} + engines: {node: '>= 0.4'} + get-stream@5.2.0: resolution: {integrity: sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==} engines: {node: '>=8'} @@ -2682,9 +2776,21 @@ packages: resolution: {integrity: sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==} engines: {node: '>=4'} + google-auth-library@9.15.1: + resolution: {integrity: sha512-Jb6Z0+nvECVz+2lzSMt9u98UsoakXxA2HGHMCxh+so3n90XgYWkq5dur19JAJV7ONiJY22yBTyJB1TSkvPq9Ng==} + engines: {node: '>=14'} + + google-logging-utils@0.0.2: + resolution: {integrity: sha512-NEgUnEcBiP5HrPzufUkBzJOD/Sxsco3rLNo1F1TNf7ieU8ryUzBhqba8r756CjLX7rn3fHl6iLEwPYuqpoKgQQ==} + engines: {node: '>=14'} + gopd@1.0.1: resolution: {integrity: sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==} + gopd@1.2.0: + resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==} + engines: {node: '>= 0.4'} + gpt3-tokenizer@1.1.5: resolution: {integrity: sha512-O9iCL8MqGR0Oe9wTh0YftzIbysypNQmS5a5JG3cB3M4LMYjlAVvNnf8LUzVY9MrI7tj+YLY356uHtO2lLX2HpA==} engines: {node: '>=12'} @@ -2692,6 +2798,10 @@ packages: graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + gtoken@7.1.0: + resolution: {integrity: sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==} + engines: {node: '>=14.0.0'} + has-flag@3.0.0: resolution: {integrity: sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==} engines: {node: '>=4'} @@ -2711,6 +2821,14 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} + has-symbols@1.1.0: + resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==} + engines: {node: '>= 0.4'} + + has-tostringtag@1.0.2: + resolution: {integrity: sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==} + engines: {node: '>= 0.4'} + hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -2731,6 +2849,9 @@ packages: resolution: {integrity: sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==} engines: {node: '>=18'} + html-entities@2.6.0: + resolution: {integrity: sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==} + html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} @@ -2745,6 +2866,10 @@ packages: resolution: {integrity: sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==} engines: {node: '>= 0.8'} + http-proxy-agent@5.0.0: + resolution: {integrity: sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==} + engines: {node: '>= 6'} + http-proxy-agent@7.0.2: resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==} engines: {node: '>= 14'} @@ -3111,6 +3236,9 @@ packages: engines: {node: '>=4'} hasBin: true + json-bigint@1.0.0: + resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==} + json-parse-even-better-errors@2.3.1: resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} @@ -3140,6 +3268,12 @@ packages: jszip@3.10.1: resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + jwa@2.0.0: + resolution: {integrity: sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==} + + jws@4.0.0: + resolution: {integrity: sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==} + kareem@2.6.3: resolution: {integrity: sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q==} engines: {node: '>=12.0.0'} @@ -3258,6 +3392,10 @@ packages: engines: {node: '>= 18'} hasBin: true + math-intrinsics@1.1.0: + resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} + engines: {node: '>= 0.4'} + md5@2.3.0: resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==} @@ -3304,6 +3442,11 @@ packages: engines: {node: '>=4.0.0'} hasBin: true + mime@3.0.0: + resolution: {integrity: sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==} + engines: {node: '>=10.0.0'} + hasBin: true + mimic-fn@2.1.0: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} @@ -3896,6 +4039,14 @@ packages: resolution: {integrity: sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==} hasBin: true + retry-request@7.0.2: + resolution: {integrity: sha512-dUOvLMJ0/JJYEn8NrpOaGNE7X3vpI5XlZS/u0ANjqtcZVKnIxP7IgCFwrKTxENw29emmwug53awKtaMm4i9g5w==} + engines: {node: '>=14'} + + retry@0.13.1: + resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} + engines: {node: '>= 4'} + rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4075,6 +4226,12 @@ packages: resolution: {integrity: sha512-I6GPS/E0zyieHehMRPQcqkiBMJKGgLta+1hREixhoLPqEA0AlVFiC43dl8uPpmkkeRdDMzYRWFWk5/l9x7nmNg==} engines: {node: '>=0.10.0'} + stream-events@1.0.5: + resolution: {integrity: sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==} + + stream-shift@1.0.3: + resolution: {integrity: sha512-76ORR0DO1o1hlKwTbi/DM3EXWGf3ZJYO8cXX5RJwnul2DEg2oyoZyjLNoQM8WsvZiFKCRfC1O0J7iCvie3RZmQ==} + streamx@2.18.0: resolution: {integrity: sha512-LLUC1TWdjVdn1weXGcSxyTR3T4+acB6tVGXT95y0nGbca4t4o/ng1wKAGTljm9VicuCVLvRlqFYXYy5GwgM7sQ==} @@ -4123,6 +4280,9 @@ packages: strnum@1.0.5: resolution: {integrity: sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==} + stubs@3.0.0: + resolution: {integrity: sha512-PdHt7hHUJKxvTCgbKX9C1V/ftOcjJQgz8BZwNfV5c4B6dcGqlpelTbJ999jBGZ2jYiPAwcX5dP6oBwVlBlUbxw==} + supabase@1.172.2: resolution: {integrity: sha512-h2J6kKEikXnZyurUcCYg215qkQpINOhdWkiclHcWAuVeqXsNrfrYaf1s0qbbcdRyMtrVW48I+VdVTw71Cnn20Q==} engines: {npm: '>=8'} @@ -4181,6 +4341,10 @@ packages: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} + teeny-request@9.0.0: + resolution: {integrity: sha512-resvxdc6Mgb7YEThw6G6bExlXKkv6+YbuzGg9xuXxSgxJF7Ozs+o8Y9+2R3sArdWdW8nOokoQb1yrpFB0pQK2g==} + engines: {node: '>=14'} + test-exclude@6.0.0: resolution: {integrity: sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==} engines: {node: '>=8'} @@ -4305,13 +4469,8 @@ packages: resolution: {integrity: sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==} engines: {node: '>= 0.6'} - typescript@5.4.5: - resolution: {integrity: sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==} - engines: {node: '>=14.17'} - hasBin: true - - typescript@5.8.2: - resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==} + typescript@5.8.3: + resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==} engines: {node: '>=14.17'} hasBin: true @@ -4385,6 +4544,10 @@ packages: resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==} hasBin: true + uuid@8.3.2: + resolution: {integrity: sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==} + hasBin: true + uuid@9.0.1: resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==} hasBin: true @@ -5423,6 +5586,36 @@ snapshots: '@dqbd/tiktoken@1.0.17': {} + '@google-cloud/paginator@5.0.2': + dependencies: + arrify: 2.0.1 + extend: 3.0.2 + + '@google-cloud/projectify@4.0.0': {} + + '@google-cloud/promisify@4.0.0': {} + + '@google-cloud/storage@7.16.0(encoding@0.1.13)': + dependencies: + '@google-cloud/paginator': 5.0.2 + '@google-cloud/projectify': 4.0.0 + '@google-cloud/promisify': 4.0.0 + abort-controller: 3.0.0 + async-retry: 1.3.3 + duplexify: 4.1.3 + fast-xml-parser: 4.4.1 + gaxios: 6.7.1(encoding@0.1.13) + google-auth-library: 9.15.1(encoding@0.1.13) + html-entities: 2.6.0 + mime: 3.0.0 + p-limit: 3.1.0 + retry-request: 7.0.2(encoding@0.1.13) + teeny-request: 9.0.0(encoding@0.1.13) + uuid: 8.3.2 + transitivePeerDependencies: + - encoding + - supports-color + '@ioredis/commands@1.2.0': {} '@isaacs/cliui@8.0.2': @@ -5457,7 +5650,7 @@ snapshots: jest-util: 29.7.0 slash: 3.0.0 - '@jest/core@29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5))': + '@jest/core@29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3))': dependencies: '@jest/console': 29.7.0 '@jest/reporters': 29.7.0 @@ -5471,7 +5664,7 @@ snapshots: exit: 0.1.2 graceful-fs: 4.2.11 jest-changed-files: 29.7.0 - jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) jest-haste-map: 29.7.0 jest-message-util: 29.7.0 jest-regex-util: 29.6.3 @@ -6530,6 +6723,8 @@ snapshots: - bufferutil - utf-8-validate + '@tootallnate/once@2.0.0': {} + '@tootallnate/quickjs-emscripten@0.23.0': {} '@tsconfig/node10@1.0.11': {} @@ -6568,6 +6763,8 @@ snapshots: '@types/connect': 3.4.38 '@types/node': 20.14.1 + '@types/caseless@0.12.5': {} + '@types/connect@3.4.36': dependencies: '@types/node': 20.14.1 @@ -6670,6 +6867,13 @@ snapshots: '@types/range-parser@1.2.7': {} + '@types/request@2.48.12': + dependencies: + '@types/caseless': 0.12.5 + '@types/node': 20.14.1 + '@types/tough-cookie': 4.0.5 + form-data: 2.5.3 + '@types/send@0.17.4': dependencies: '@types/mime': 1.3.5 @@ -6697,6 +6901,8 @@ snapshots: '@types/methods': 1.1.4 '@types/superagent': 8.1.9 + '@types/tough-cookie@4.0.5': {} + '@types/triple-beam@1.3.5': {} '@types/uuid@9.0.8': {} @@ -6832,6 +7038,8 @@ snapshots: array-keyed-map@2.1.3: {} + arrify@2.0.1: {} + asap@2.0.6: {} ast-types@0.13.4: @@ -6842,6 +7050,10 @@ snapshots: dependencies: tslib: 2.6.3 + async-retry@1.3.3: + dependencies: + retry: 0.13.1 + async@2.6.4: dependencies: lodash: 4.17.21 @@ -6961,6 +7173,8 @@ snapshots: basic-ftp@5.0.5: {} + bignumber.js@9.2.0: {} + bin-links@4.0.4: dependencies: cmd-shim: 6.0.3 @@ -7027,6 +7241,8 @@ snapshots: buffer-crc32@0.2.13: {} + buffer-equal-constant-time@1.0.1: {} + buffer-from@1.1.2: {} buffer@5.7.1: @@ -7055,6 +7271,11 @@ snapshots: cacheable-lookup@6.1.0: {} + call-bind-apply-helpers@1.0.2: + dependencies: + es-errors: 1.3.0 + function-bind: 1.1.2 + call-bind@1.0.7: dependencies: es-define-property: 1.0.0 @@ -7234,22 +7455,22 @@ snapshots: corser@2.0.1: {} - cosmiconfig@9.0.0(typescript@5.4.5): + cosmiconfig@9.0.0(typescript@5.8.3): dependencies: env-paths: 2.2.1 import-fresh: 3.3.0 js-yaml: 4.1.0 parse-json: 5.2.0 optionalDependencies: - typescript: 5.4.5 + typescript: 5.8.3 - create-jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)): + create-jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)): dependencies: '@jest/types': 29.6.3 chalk: 4.1.2 exit: 0.1.2 graceful-fs: 4.2.11 - jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) jest-util: 29.7.0 prompts: 2.4.2 transitivePeerDependencies: @@ -7406,8 +7627,25 @@ snapshots: dependencies: underscore: 1.13.6 + dunder-proto@1.0.1: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-errors: 1.3.0 + gopd: 1.2.0 + + duplexify@4.1.3: + dependencies: + end-of-stream: 1.4.4 + inherits: 2.0.4 + readable-stream: 3.6.2 + stream-shift: 1.0.3 + eastasianwidth@0.2.0: {} + ecdsa-sig-formatter@1.0.11: + dependencies: + safe-buffer: 5.2.1 + editorconfig@1.0.4: dependencies: '@one-ini/wasm': 0.1.1 @@ -7453,8 +7691,21 @@ snapshots: dependencies: get-intrinsic: 1.2.4 + es-define-property@1.0.1: {} + es-errors@1.3.0: {} + es-object-atoms@1.1.1: + dependencies: + es-errors: 1.3.0 + + es-set-tostringtag@2.1.0: + dependencies: + es-errors: 1.3.0 + get-intrinsic: 1.3.0 + has-tostringtag: 1.0.2 + hasown: 2.0.2 + escalade@3.1.2: {} escape-html@1.0.3: {} @@ -7557,6 +7808,8 @@ snapshots: transitivePeerDependencies: - supports-color + extend@3.0.2: {} + extract-zip@2.0.1: dependencies: debug: 4.3.5 @@ -7636,6 +7889,14 @@ snapshots: form-data-encoder@4.0.2: {} + form-data@2.5.3: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + es-set-tostringtag: 2.1.0 + mime-types: 2.1.35 + safe-buffer: 5.2.1 + form-data@4.0.0: dependencies: asynckit: 0.4.0 @@ -7683,6 +7944,26 @@ snapshots: function-bind@1.1.2: {} + gaxios@6.7.1(encoding@0.1.13): + dependencies: + extend: 3.0.2 + https-proxy-agent: 7.0.6 + is-stream: 2.0.1 + node-fetch: 2.7.0(encoding@0.1.13) + uuid: 9.0.1 + transitivePeerDependencies: + - encoding + - supports-color + + gcp-metadata@6.1.1(encoding@0.1.13): + dependencies: + gaxios: 6.7.1(encoding@0.1.13) + google-logging-utils: 0.0.2 + json-bigint: 1.0.0 + transitivePeerDependencies: + - encoding + - supports-color + generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7697,8 +7978,26 @@ snapshots: has-symbols: 1.0.3 hasown: 2.0.2 + get-intrinsic@1.3.0: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-define-property: 1.0.1 + es-errors: 1.3.0 + es-object-atoms: 1.1.1 + function-bind: 1.1.2 + get-proto: 1.0.1 + gopd: 1.2.0 + has-symbols: 1.1.0 + hasown: 2.0.2 + math-intrinsics: 1.1.0 + get-package-type@0.1.0: {} + get-proto@1.0.1: + dependencies: + dunder-proto: 1.0.1 + es-object-atoms: 1.1.1 + get-stream@5.2.0: dependencies: pump: 3.0.0 @@ -7738,16 +8037,40 @@ snapshots: globals@11.12.0: {} + google-auth-library@9.15.1(encoding@0.1.13): + dependencies: + base64-js: 1.5.1 + ecdsa-sig-formatter: 1.0.11 + gaxios: 6.7.1(encoding@0.1.13) + gcp-metadata: 6.1.1(encoding@0.1.13) + gtoken: 7.1.0(encoding@0.1.13) + jws: 4.0.0 + transitivePeerDependencies: + - encoding + - supports-color + + google-logging-utils@0.0.2: {} + gopd@1.0.1: dependencies: get-intrinsic: 1.2.4 + gopd@1.2.0: {} + gpt3-tokenizer@1.1.5: dependencies: array-keyed-map: 2.1.3 graceful-fs@4.2.11: {} + gtoken@7.1.0(encoding@0.1.13): + dependencies: + gaxios: 6.7.1(encoding@0.1.13) + jws: 4.0.0 + transitivePeerDependencies: + - encoding + - supports-color + has-flag@3.0.0: {} has-flag@4.0.0: {} @@ -7760,6 +8083,12 @@ snapshots: has-symbols@1.0.3: {} + has-symbols@1.1.0: {} + + has-tostringtag@1.0.2: + dependencies: + has-symbols: 1.0.3 + hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -7776,6 +8105,8 @@ snapshots: dependencies: whatwg-encoding: 3.1.1 + html-entities@2.6.0: {} + html-escaper@2.0.2: {} html-to-text@9.0.5: @@ -7801,6 +8132,14 @@ snapshots: statuses: 2.0.1 toidentifier: 1.0.1 + http-proxy-agent@5.0.0: + dependencies: + '@tootallnate/once': 2.0.0 + agent-base: 6.0.2 + debug: 4.3.5 + transitivePeerDependencies: + - supports-color + http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 @@ -8073,16 +8412,16 @@ snapshots: - babel-plugin-macros - supports-color - jest-cli@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)): + jest-cli@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)): dependencies: - '@jest/core': 29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + '@jest/core': 29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) '@jest/test-result': 29.7.0 '@jest/types': 29.6.3 chalk: 4.1.2 - create-jest: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + create-jest: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) exit: 0.1.2 import-local: 3.1.0 - jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + jest-config: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) jest-util: 29.7.0 jest-validate: 29.7.0 yargs: 17.7.2 @@ -8092,7 +8431,7 @@ snapshots: - supports-color - ts-node - jest-config@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)): + jest-config@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)): dependencies: '@babel/core': 7.24.6 '@jest/test-sequencer': 29.7.0 @@ -8118,7 +8457,7 @@ snapshots: strip-json-comments: 3.1.1 optionalDependencies: '@types/node': 20.14.1 - ts-node: 10.9.2(@types/node@20.14.1)(typescript@5.4.5) + ts-node: 10.9.2(@types/node@20.14.1)(typescript@5.8.3) transitivePeerDependencies: - babel-plugin-macros - supports-color @@ -8345,12 +8684,12 @@ snapshots: merge-stream: 2.0.0 supports-color: 8.1.1 - jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)): + jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)): dependencies: - '@jest/core': 29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + '@jest/core': 29.7.0(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) '@jest/types': 29.6.3 import-local: 3.1.0 - jest-cli: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + jest-cli: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) transitivePeerDependencies: - '@types/node' - babel-plugin-macros @@ -8414,6 +8753,10 @@ snapshots: jsesc@2.5.2: {} + json-bigint@1.0.0: + dependencies: + bignumber.js: 9.2.0 + json-parse-even-better-errors@2.3.1: {} json-schema-to-zod@2.3.0: {} @@ -8443,6 +8786,17 @@ snapshots: readable-stream: 2.3.8 setimmediate: 1.0.5 + jwa@2.0.0: + dependencies: + buffer-equal-constant-time: 1.0.1 + ecdsa-sig-formatter: 1.0.11 + safe-buffer: 5.2.1 + + jws@4.0.0: + dependencies: + jwa: 2.0.0 + safe-buffer: 5.2.1 + kareem@2.6.3: {} keyword-extractor@0.0.28: {} @@ -8556,6 +8910,8 @@ snapshots: marked@14.1.2: {} + math-intrinsics@1.1.0: {} + md5@2.3.0: dependencies: charenc: 0.0.2 @@ -8589,6 +8945,8 @@ snapshots: mime@2.6.0: {} + mime@3.0.0: {} + mimic-fn@2.1.0: {} minimatch@3.1.2: @@ -9036,7 +9394,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.8.2 + typescript: 5.8.3 uuid: 9.0.1 zod: 3.24.2 transitivePeerDependencies: @@ -9090,10 +9448,10 @@ snapshots: - supports-color - utf-8-validate - puppeteer@22.12.1(typescript@5.4.5): + puppeteer@22.12.1(typescript@5.8.3): dependencies: '@puppeteer/browsers': 2.2.3 - cosmiconfig: 9.0.0(typescript@5.4.5) + cosmiconfig: 9.0.0(typescript@5.8.3) devtools-protocol: 0.0.1299070 puppeteer-core: 22.12.1 transitivePeerDependencies: @@ -9234,6 +9592,17 @@ snapshots: path-parse: 1.0.7 supports-preserve-symlinks-flag: 1.0.0 + retry-request@7.0.2(encoding@0.1.13): + dependencies: + '@types/request': 2.48.12 + extend: 3.0.2 + teeny-request: 9.0.0(encoding@0.1.13) + transitivePeerDependencies: + - encoding + - supports-color + + retry@0.13.1: {} + rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9407,6 +9776,12 @@ snapshots: stopwords-iso@1.1.0: {} + stream-events@1.0.5: + dependencies: + stubs: 3.0.0 + + stream-shift@1.0.3: {} + streamx@2.18.0: dependencies: fast-fifo: 1.3.2 @@ -9461,6 +9836,8 @@ snapshots: strnum@1.0.5: {} + stubs@3.0.0: {} + supabase@1.172.2: dependencies: bin-links: 4.0.4 @@ -9541,6 +9918,17 @@ snapshots: mkdirp: 3.0.1 yallist: 5.0.0 + teeny-request@9.0.0(encoding@0.1.13): + dependencies: + http-proxy-agent: 5.0.0 + https-proxy-agent: 5.0.1 + node-fetch: 2.7.0(encoding@0.1.13) + stream-events: 1.0.5 + uuid: 9.0.1 + transitivePeerDependencies: + - encoding + - supports-color + test-exclude@6.0.0: dependencies: '@istanbuljs/schema': 0.1.3 @@ -9591,17 +9979,17 @@ snapshots: triple-beam@1.4.1: {} - ts-jest@29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)))(typescript@5.4.5): + ts-jest@29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)))(typescript@5.8.3): dependencies: bs-logger: 0.2.6 fast-json-stable-stringify: 2.1.0 - jest: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) + jest: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3)) jest-util: 29.7.0 json5: 2.2.3 lodash.memoize: 4.1.2 make-error: 1.3.6 semver: 7.6.2 - typescript: 5.4.5 + typescript: 5.8.3 yargs-parser: 21.1.1 optionalDependencies: '@babel/core': 7.24.6 @@ -9609,7 +9997,7 @@ snapshots: '@jest/types': 29.6.3 babel-jest: 29.7.0(@babel/core@7.24.6) - ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5): + ts-node@10.9.2(@types/node@20.14.1)(typescript@5.8.3): dependencies: '@cspotcode/source-map-support': 0.8.1 '@tsconfig/node10': 1.0.11 @@ -9623,7 +10011,7 @@ snapshots: create-require: 1.1.1 diff: 4.0.2 make-error: 1.3.6 - typescript: 5.4.5 + typescript: 5.8.3 v8-compile-cache-lib: 3.0.1 yn: 3.1.1 @@ -9646,9 +10034,7 @@ snapshots: media-typer: 0.3.0 mime-types: 2.1.35 - typescript@5.4.5: {} - - typescript@5.8.2: {} + typescript@5.8.3: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: @@ -9709,6 +10095,8 @@ snapshots: uuid@10.0.0: {} + uuid@8.3.2: {} + uuid@9.0.1: {} v8-compile-cache-lib@3.0.1: {} diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index ac41b6e8..5432687f 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -1,10 +1,10 @@ -import { ExtractorOptions } from "./../../lib/entities"; import { supabase_service } from "../supabase"; import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; import { logger } from "../../lib/logger"; import { configDotenv } from "dotenv"; +import { Storage } from "@google-cloud/storage"; configDotenv(); function cleanOfNull(x: T): T { @@ -21,6 +21,43 @@ function cleanOfNull(x: T): T { } } + +async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { + try { + const storage = new Storage(); + const bucket = storage.bucket(bucketName); + const blob = bucket.file(`${job.job_id}.json`); + await blob.save(JSON.stringify(job.docs), { + contentType: "application/json", + }); + await blob.setMetadata({ + metadata: { + job_id: job.job_id ?? null, + success: job.success, + message: job.message ?? null, + num_docs: job.num_docs, + time_taken: job.time_taken, + team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.scrapeOptions, + origin: job.origin, + num_tokens: job.num_tokens ?? null, + retry: !!job.retry, + crawl_id: job.crawl_id ?? null, + tokens_billed: job.tokens_billed ?? null, + }, + }) + } catch (error) { + logger.error(`Error saving job to GCS`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + }); + } +} + async function indexJob(job: FirecrawlJob): Promise { try { if (job.mode !== "single_urls" && job.mode !== "scrape") { @@ -108,6 +145,10 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { indexJob(job); } + if (process.env.GCS_BUCKET_NAME) { + await saveJobToGCS(job, process.env.GCS_BUCKET_NAME); + } + if (force) { let i = 0, done = false; From 37b13ba146d35f016e77453e920e8029af901b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 8 Apr 2025 20:04:59 +0200 Subject: [PATCH 049/160] feat(log_job): allow use of api key if specified --- apps/api/src/services/logging/log_job.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 5432687f..d1751a6c 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -24,7 +24,9 @@ function cleanOfNull(x: T): T { async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { try { - const storage = new Storage(); + const storage = new Storage({ + apiKey: process.env.GCS_API_KEY, + }); const bucket = storage.bucket(bucketName); const blob = bucket.file(`${job.job_id}.json`); await blob.save(JSON.stringify(job.docs), { From c69d1561794075f4248d325e81118db6d3387687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 8 Apr 2025 20:18:48 +0200 Subject: [PATCH 050/160] fix(log_job): use service account credentials --- apps/api/src/services/logging/log_job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index d1751a6c..80481a3e 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -25,7 +25,7 @@ function cleanOfNull(x: T): T { async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { try { const storage = new Storage({ - apiKey: process.env.GCS_API_KEY, + credentials: process.env.GCS_CREDENTIALS ? JSON.parse(process.env.GCS_CREDENTIALS) : undefined, }); const bucket = storage.bucket(bucketName); const blob = bucket.file(`${job.job_id}.json`); From 62265c63c85ff58fc0ca34275bdc19c2cd7640bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 8 Apr 2025 20:26:43 +0200 Subject: [PATCH 051/160] feat(log_job): use atob --- apps/api/src/services/logging/log_job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 80481a3e..9caa6716 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -25,7 +25,7 @@ function cleanOfNull(x: T): T { async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { try { const storage = new Storage({ - credentials: process.env.GCS_CREDENTIALS ? JSON.parse(process.env.GCS_CREDENTIALS) : undefined, + credentials: process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined, }); const bucket = storage.bucket(bucketName); const blob = bucket.file(`${job.job_id}.json`); From 8c801ed95672e75501dae03f64b9d7e11f5ca1cb Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 21:09:31 +0200 Subject: [PATCH 052/160] Rename 'compare' format and property to 'changeTracking' (#1423) --- apps/api/src/__tests__/snips/scrape.test.ts | 8 ++++---- apps/api/src/controllers/v1/types.ts | 10 +++++----- apps/api/src/scraper/scrapeURL/transformers/diff.ts | 6 +++--- apps/api/src/scraper/scrapeURL/transformers/index.ts | 10 +++++----- apps/js-sdk/firecrawl/src/index.ts | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 7495e789..bb053271 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -85,15 +85,15 @@ describe("Scrape tests", () => { // }, 30000); // }); - describe("Compare format", () => { + describe("Change Tracking format", () => { it.concurrent("works", async () => { const response = await scrape({ url: "https://example.com", - formats: ["markdown", "compare"], + formats: ["markdown", "changeTracking"], }); - expect(response.compare).toBeDefined(); - expect(response.compare?.previousScrapeAt).not.toBeNull(); + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); }); }); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index b610826d..ea46768f 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -21,7 +21,7 @@ export type Format = | "screenshot" | "screenshot@fullPage" | "extract" - | "compare"; + | "changeTracking"; export const url = z.preprocess( (x) => { @@ -166,7 +166,7 @@ const baseScrapeOptions = z "screenshot@fullPage", "extract", "json", - "compare", + "changeTracking", ]) .array() .optional() @@ -176,8 +176,8 @@ const baseScrapeOptions = z "You may only specify either screenshot or screenshot@fullPage", ) .refine( - (x) => !x.includes("compare") || x.includes("markdown"), - "The compare format requires the markdown format to be specified as well", + (x) => !x.includes("changeTracking") || x.includes("markdown"), + "The changeTracking format requires the markdown format to be specified as well", ), headers: z.record(z.string(), z.string()).optional(), includeTags: z.string().array().optional(), @@ -552,7 +552,7 @@ export type Document = { value: unknown }[]; }; - compare?: { + changeTracking?: { previousScrapeAt: string | null; changeStatus: "new" | "same" | "changed" | "removed"; visibility: "visible" | "hidden"; diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index 9628844d..edc6efd3 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -3,7 +3,7 @@ import { Document } from "../../../controllers/v1/types"; import { Meta } from "../index"; export async function deriveDiff(meta: Meta, document: Document): Promise { - if (meta.options.formats.includes("compare")) { + if (meta.options.formats.includes("changeTracking")) { const res = await supabase_service .rpc("diff_get_last_scrape_1", { i_team_id: meta.internalOptions.teamId, @@ -21,13 +21,13 @@ export async function deriveDiff(meta: Meta, document: Document): Promise [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); - document.compare = { + document.changeTracking = { previousScrapeAt: data.o_date_added, changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed", visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", } } else if (!res.error) { - document.compare = { + document.changeTracking = { previousScrapeAt: null, changeStatus: document.metadata.statusCode === 404 ? "removed" : "new", visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index 114c59a8..5b16ae99 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -148,14 +148,14 @@ export function coerceFieldsToFormats( ); } - if (!formats.has("compare") && document.compare !== undefined) { + if (!formats.has("changeTracking") && document.changeTracking !== undefined) { meta.logger.warn( - "Removed compare from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", + "Removed changeTracking from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", ); - delete document.compare; - } else if (formats.has("compare") && document.compare === undefined) { + delete document.changeTracking; + } else if (formats.has("changeTracking") && document.changeTracking === undefined) { meta.logger.warn( - "Request had format compare, but there was no compare field in the result.", + "Request had format changeTracking, but there was no changeTracking field in the result.", ); } diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 4942ec69..6f76220f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -68,7 +68,7 @@ export interface FirecrawlDocument; includeTags?: string[]; excludeTags?: string[]; From ab6fb48e6e53d2e4658628d2ae5e6cd7da94465e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 8 Apr 2025 21:11:42 +0200 Subject: [PATCH 053/160] bump ver --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 8a17ed18..7c64b33e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.22.0", + "version": "1.22.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 673bf6a2dea0f2340f964ba9c25cab8e92d929e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 12:27:23 +0200 Subject: [PATCH 054/160] feat(crawl-status): retrieve job data from GCS (#1427) * feat(crawl-status): retrieve job data from GCS * feat(gcs-jobs/save): retrying saving metadata (might conflict) * feat(gcs-jobs/save): retry save operation * fix(gcs-jobs/save): respect metadata rules * feat(crawl-status): log if gcs job is not found --- apps/api/src/controllers/v1/crawl-status.ts | 29 +++++- apps/api/src/lib/gcs-jobs.ts | 104 ++++++++++++++++++++ apps/api/src/services/logging/log_job.ts | 43 +------- 3 files changed, 130 insertions(+), 46 deletions(-) create mode 100644 apps/api/src/lib/gcs-jobs.ts diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 96aa578e..bcefa9c3 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -13,7 +13,7 @@ import { getDoneJobsOrderedLength, isCrawlKickoffFinished, } from "../../lib/crawl-redis"; -import { getScrapeQueue, QueueFunction } from "../../services/queue-service"; +import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById, @@ -23,6 +23,7 @@ import type { Job, JobState, Queue } from "bullmq"; import { logger } from "../../lib/logger"; import { supabase_rr_service, supabase_service } from "../../services/supabase"; import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; configDotenv(); export type PseudoJob = { @@ -39,14 +40,20 @@ export type PseudoJob = { export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null } export async function getJob(id: string): Promise | null> { - const [bullJob, dbJob] = await Promise.all([ + const [bullJob, dbJob, gcsJob] = await Promise.all([ getScrapeQueue().getJob(id), (process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobById(id) : null) as Promise, + (process.env.GCS_BUCKET_NAME ? getJobFromGCS(id) : null) as Promise, ]); if (!bullJob && !dbJob) return null; - const data = dbJob?.docs ?? bullJob?.returnvalue; + const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; + if (gcsJob === null && data) { + logger.warn("GCS Job not found", { + jobId: id, + }); + } const job: PseudoJob = { id, @@ -65,13 +72,15 @@ export async function getJob(id: string): Promise | null> { } export async function getJobs(ids: string[]): Promise[]> { - const [bullJobs, dbJobs] = await Promise.all([ + const [bullJobs, dbJobs, gcsJobs] = await Promise.all([ Promise.all(ids.map((x) => getScrapeQueue().getJob(x))).then(x => x.filter(x => x)) as Promise<(Job & { id: string })[]>, process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobsById(ids) : [], + process.env.GCS_BUCKET_NAME ? Promise.all(ids.map(async (x) => ({ id: x, job: await getJobFromGCS(x) }))).then(x => x.filter(x => x.job)) as Promise<({ id: string, job: any | null })[]> : [], ]); const bullJobMap = new Map>(); const dbJobMap = new Map(); + const gcsJobMap = new Map(); for (const job of bullJobs) { bullJobMap.set(job.id, job); @@ -81,15 +90,25 @@ export async function getJobs(ids: string[]): Promise[]> { dbJobMap.set(job.job_id, job); } + for (const job of gcsJobs) { + gcsJobMap.set(job.id, job.job); + } + const jobs: PseudoJob[] = []; for (const id of ids) { const bullJob = bullJobMap.get(id); const dbJob = dbJobMap.get(id); + const gcsJob = gcsJobMap.get(id); if (!bullJob && !dbJob) continue; - const data = dbJob?.docs ?? bullJob?.returnvalue; + const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; + if (gcsJob === null && data) { + logger.warn("GCS Job not found", { + jobId: id, + }); + } const job: PseudoJob = { id, diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts new file mode 100644 index 00000000..024cc410 --- /dev/null +++ b/apps/api/src/lib/gcs-jobs.ts @@ -0,0 +1,104 @@ +import { FirecrawlJob } from "../types"; +import { Storage } from "@google-cloud/storage"; +import { logger } from "./logger"; + +const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined; + +export async function saveJobToGCS(job: FirecrawlJob): Promise { + try { + if (!process.env.GCS_BUCKET_NAME) { + return; + } + + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${job.job_id}.json`); + for (let i = 0; i < 3; i++) { + try { + await blob.save(JSON.stringify(job.docs), { + contentType: "application/json", + }); + break; + } catch (error) { + if (i === 2) { + throw error; + } else { + logger.error(`Error saving job to GCS, retrying`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + i, + }); + } + } + } + for (let i = 0; i < 3; i++) { + try { + await blob.setMetadata({ + metadata: { + job_id: job.job_id ?? null, + success: job.success, + message: job.message ?? null, + num_docs: job.num_docs, + time_taken: job.time_taken, + team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_")) ? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: JSON.stringify(job.crawlerOptions), + page_options: JSON.stringify(job.scrapeOptions), + origin: job.origin, + num_tokens: job.num_tokens ?? null, + retry: !!job.retry, + crawl_id: job.crawl_id ?? null, + tokens_billed: job.tokens_billed ?? null, + }, + }); + break; + } catch (error) { + if (i === 2) { + throw error; + } else { + logger.error(`Error saving job metadata to GCS, retrying`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + i, + }); + } + } + } + } catch (error) { + logger.error(`Error saving job to GCS`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + }); + } +} + +export async function getJobFromGCS(jobId: string): Promise { + try { + if (!process.env.GCS_BUCKET_NAME) { + return null; + } + + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${jobId}.json`); + const [exists] = await blob.exists(); + if (!exists) { + return null; + } + const [content] = await blob.download(); + const x = JSON.parse(content.toString()); + console.log("Downloaded file ", jobId, x); + return x; + } catch (error) { + logger.error(`Error getting job from GCS`, { + error, + jobId, + scrapeId: jobId, + }); + return null; + } +} \ No newline at end of file diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 9caa6716..ae1d66ae 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -4,7 +4,7 @@ import { posthog } from "../posthog"; import "dotenv/config"; import { logger } from "../../lib/logger"; import { configDotenv } from "dotenv"; -import { Storage } from "@google-cloud/storage"; +import { saveJobToGCS } from "../../lib/gcs-jobs"; configDotenv(); function cleanOfNull(x: T): T { @@ -21,45 +21,6 @@ function cleanOfNull(x: T): T { } } - -async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { - try { - const storage = new Storage({ - credentials: process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined, - }); - const bucket = storage.bucket(bucketName); - const blob = bucket.file(`${job.job_id}.json`); - await blob.save(JSON.stringify(job.docs), { - contentType: "application/json", - }); - await blob.setMetadata({ - metadata: { - job_id: job.job_id ?? null, - success: job.success, - message: job.message ?? null, - num_docs: job.num_docs, - time_taken: job.time_taken, - team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, - mode: job.mode, - url: job.url, - crawler_options: job.crawlerOptions, - page_options: job.scrapeOptions, - origin: job.origin, - num_tokens: job.num_tokens ?? null, - retry: !!job.retry, - crawl_id: job.crawl_id ?? null, - tokens_billed: job.tokens_billed ?? null, - }, - }) - } catch (error) { - logger.error(`Error saving job to GCS`, { - error, - scrapeId: job.job_id, - jobId: job.job_id, - }); - } -} - async function indexJob(job: FirecrawlJob): Promise { try { if (job.mode !== "single_urls" && job.mode !== "scrape") { @@ -148,7 +109,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { } if (process.env.GCS_BUCKET_NAME) { - await saveJobToGCS(job, process.env.GCS_BUCKET_NAME); + await saveJobToGCS(job); } if (force) { From 670e4a6bf1d48533a4222e12f44c5a5238051645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 12:28:46 +0200 Subject: [PATCH 055/160] Revert "feat(crawl-status): retrieve job data from GCS (#1427)" This reverts commit 673bf6a2dea0f2340f964ba9c25cab8e92d929e4. --- apps/api/src/controllers/v1/crawl-status.ts | 29 +----- apps/api/src/lib/gcs-jobs.ts | 104 -------------------- apps/api/src/services/logging/log_job.ts | 43 +++++++- 3 files changed, 46 insertions(+), 130 deletions(-) delete mode 100644 apps/api/src/lib/gcs-jobs.ts diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index bcefa9c3..96aa578e 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -13,7 +13,7 @@ import { getDoneJobsOrderedLength, isCrawlKickoffFinished, } from "../../lib/crawl-redis"; -import { getScrapeQueue } from "../../services/queue-service"; +import { getScrapeQueue, QueueFunction } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById, @@ -23,7 +23,6 @@ import type { Job, JobState, Queue } from "bullmq"; import { logger } from "../../lib/logger"; import { supabase_rr_service, supabase_service } from "../../services/supabase"; import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit"; -import { getJobFromGCS } from "../../lib/gcs-jobs"; configDotenv(); export type PseudoJob = { @@ -40,20 +39,14 @@ export type PseudoJob = { export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null } export async function getJob(id: string): Promise | null> { - const [bullJob, dbJob, gcsJob] = await Promise.all([ + const [bullJob, dbJob] = await Promise.all([ getScrapeQueue().getJob(id), (process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobById(id) : null) as Promise, - (process.env.GCS_BUCKET_NAME ? getJobFromGCS(id) : null) as Promise, ]); if (!bullJob && !dbJob) return null; - const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; - if (gcsJob === null && data) { - logger.warn("GCS Job not found", { - jobId: id, - }); - } + const data = dbJob?.docs ?? bullJob?.returnvalue; const job: PseudoJob = { id, @@ -72,15 +65,13 @@ export async function getJob(id: string): Promise | null> { } export async function getJobs(ids: string[]): Promise[]> { - const [bullJobs, dbJobs, gcsJobs] = await Promise.all([ + const [bullJobs, dbJobs] = await Promise.all([ Promise.all(ids.map((x) => getScrapeQueue().getJob(x))).then(x => x.filter(x => x)) as Promise<(Job & { id: string })[]>, process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobsById(ids) : [], - process.env.GCS_BUCKET_NAME ? Promise.all(ids.map(async (x) => ({ id: x, job: await getJobFromGCS(x) }))).then(x => x.filter(x => x.job)) as Promise<({ id: string, job: any | null })[]> : [], ]); const bullJobMap = new Map>(); const dbJobMap = new Map(); - const gcsJobMap = new Map(); for (const job of bullJobs) { bullJobMap.set(job.id, job); @@ -90,25 +81,15 @@ export async function getJobs(ids: string[]): Promise[]> { dbJobMap.set(job.job_id, job); } - for (const job of gcsJobs) { - gcsJobMap.set(job.id, job.job); - } - const jobs: PseudoJob[] = []; for (const id of ids) { const bullJob = bullJobMap.get(id); const dbJob = dbJobMap.get(id); - const gcsJob = gcsJobMap.get(id); if (!bullJob && !dbJob) continue; - const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; - if (gcsJob === null && data) { - logger.warn("GCS Job not found", { - jobId: id, - }); - } + const data = dbJob?.docs ?? bullJob?.returnvalue; const job: PseudoJob = { id, diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts deleted file mode 100644 index 024cc410..00000000 --- a/apps/api/src/lib/gcs-jobs.ts +++ /dev/null @@ -1,104 +0,0 @@ -import { FirecrawlJob } from "../types"; -import { Storage } from "@google-cloud/storage"; -import { logger } from "./logger"; - -const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined; - -export async function saveJobToGCS(job: FirecrawlJob): Promise { - try { - if (!process.env.GCS_BUCKET_NAME) { - return; - } - - const storage = new Storage({ credentials }); - const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); - const blob = bucket.file(`${job.job_id}.json`); - for (let i = 0; i < 3; i++) { - try { - await blob.save(JSON.stringify(job.docs), { - contentType: "application/json", - }); - break; - } catch (error) { - if (i === 2) { - throw error; - } else { - logger.error(`Error saving job to GCS, retrying`, { - error, - scrapeId: job.job_id, - jobId: job.job_id, - i, - }); - } - } - } - for (let i = 0; i < 3; i++) { - try { - await blob.setMetadata({ - metadata: { - job_id: job.job_id ?? null, - success: job.success, - message: job.message ?? null, - num_docs: job.num_docs, - time_taken: job.time_taken, - team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_")) ? null : job.team_id, - mode: job.mode, - url: job.url, - crawler_options: JSON.stringify(job.crawlerOptions), - page_options: JSON.stringify(job.scrapeOptions), - origin: job.origin, - num_tokens: job.num_tokens ?? null, - retry: !!job.retry, - crawl_id: job.crawl_id ?? null, - tokens_billed: job.tokens_billed ?? null, - }, - }); - break; - } catch (error) { - if (i === 2) { - throw error; - } else { - logger.error(`Error saving job metadata to GCS, retrying`, { - error, - scrapeId: job.job_id, - jobId: job.job_id, - i, - }); - } - } - } - } catch (error) { - logger.error(`Error saving job to GCS`, { - error, - scrapeId: job.job_id, - jobId: job.job_id, - }); - } -} - -export async function getJobFromGCS(jobId: string): Promise { - try { - if (!process.env.GCS_BUCKET_NAME) { - return null; - } - - const storage = new Storage({ credentials }); - const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); - const blob = bucket.file(`${jobId}.json`); - const [exists] = await blob.exists(); - if (!exists) { - return null; - } - const [content] = await blob.download(); - const x = JSON.parse(content.toString()); - console.log("Downloaded file ", jobId, x); - return x; - } catch (error) { - logger.error(`Error getting job from GCS`, { - error, - jobId, - scrapeId: jobId, - }); - return null; - } -} \ No newline at end of file diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index ae1d66ae..9caa6716 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -4,7 +4,7 @@ import { posthog } from "../posthog"; import "dotenv/config"; import { logger } from "../../lib/logger"; import { configDotenv } from "dotenv"; -import { saveJobToGCS } from "../../lib/gcs-jobs"; +import { Storage } from "@google-cloud/storage"; configDotenv(); function cleanOfNull(x: T): T { @@ -21,6 +21,45 @@ function cleanOfNull(x: T): T { } } + +async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { + try { + const storage = new Storage({ + credentials: process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined, + }); + const bucket = storage.bucket(bucketName); + const blob = bucket.file(`${job.job_id}.json`); + await blob.save(JSON.stringify(job.docs), { + contentType: "application/json", + }); + await blob.setMetadata({ + metadata: { + job_id: job.job_id ?? null, + success: job.success, + message: job.message ?? null, + num_docs: job.num_docs, + time_taken: job.time_taken, + team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.scrapeOptions, + origin: job.origin, + num_tokens: job.num_tokens ?? null, + retry: !!job.retry, + crawl_id: job.crawl_id ?? null, + tokens_billed: job.tokens_billed ?? null, + }, + }) + } catch (error) { + logger.error(`Error saving job to GCS`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + }); + } +} + async function indexJob(job: FirecrawlJob): Promise { try { if (job.mode !== "single_urls" && job.mode !== "scrape") { @@ -109,7 +148,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { } if (process.env.GCS_BUCKET_NAME) { - await saveJobToGCS(job); + await saveJobToGCS(job, process.env.GCS_BUCKET_NAME); } if (force) { From 3a8de846e32b03254dfcd1adda787b8a192d5315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 12:47:51 +0200 Subject: [PATCH 056/160] read from GCS (again) (#1433) * feat(crawl-status): retrieve job data from GCS * feat(gcs-jobs/save): retrying saving metadata (might conflict) * feat(gcs-jobs/save): retry save operation * fix(gcs-jobs/save): respect metadata rules * feat(crawl-status): log if gcs job is not found * feat(ci/test/server): add gcs --- .github/workflows/test-server.yml | 2 + apps/api/src/controllers/v1/crawl-status.ts | 29 +++++- apps/api/src/lib/gcs-jobs.ts | 104 ++++++++++++++++++++ apps/api/src/services/logging/log_job.ts | 43 +------- 4 files changed, 132 insertions(+), 46 deletions(-) create mode 100644 apps/api/src/lib/gcs-jobs.ts diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index 2c52dd94..7a45dece 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -29,6 +29,8 @@ env: ENV: ${{ secrets.ENV }} RUNPOD_MU_POD_ID: ${{ secrets.RUNPOD_MU_POD_ID }} RUNPOD_MU_API_KEY: ${{ secrets.RUNPOD_MU_API_KEY }} + GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }} + GCS_BUCKET_NAME: ${{ secrets.GCS_BUCKET_NAME }} USE_GO_MARKDOWN_PARSER: true jobs: diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 96aa578e..bcefa9c3 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -13,7 +13,7 @@ import { getDoneJobsOrderedLength, isCrawlKickoffFinished, } from "../../lib/crawl-redis"; -import { getScrapeQueue, QueueFunction } from "../../services/queue-service"; +import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById, @@ -23,6 +23,7 @@ import type { Job, JobState, Queue } from "bullmq"; import { logger } from "../../lib/logger"; import { supabase_rr_service, supabase_service } from "../../services/supabase"; import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; configDotenv(); export type PseudoJob = { @@ -39,14 +40,20 @@ export type PseudoJob = { export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null } export async function getJob(id: string): Promise | null> { - const [bullJob, dbJob] = await Promise.all([ + const [bullJob, dbJob, gcsJob] = await Promise.all([ getScrapeQueue().getJob(id), (process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobById(id) : null) as Promise, + (process.env.GCS_BUCKET_NAME ? getJobFromGCS(id) : null) as Promise, ]); if (!bullJob && !dbJob) return null; - const data = dbJob?.docs ?? bullJob?.returnvalue; + const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; + if (gcsJob === null && data) { + logger.warn("GCS Job not found", { + jobId: id, + }); + } const job: PseudoJob = { id, @@ -65,13 +72,15 @@ export async function getJob(id: string): Promise | null> { } export async function getJobs(ids: string[]): Promise[]> { - const [bullJobs, dbJobs] = await Promise.all([ + const [bullJobs, dbJobs, gcsJobs] = await Promise.all([ Promise.all(ids.map((x) => getScrapeQueue().getJob(x))).then(x => x.filter(x => x)) as Promise<(Job & { id: string })[]>, process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobsById(ids) : [], + process.env.GCS_BUCKET_NAME ? Promise.all(ids.map(async (x) => ({ id: x, job: await getJobFromGCS(x) }))).then(x => x.filter(x => x.job)) as Promise<({ id: string, job: any | null })[]> : [], ]); const bullJobMap = new Map>(); const dbJobMap = new Map(); + const gcsJobMap = new Map(); for (const job of bullJobs) { bullJobMap.set(job.id, job); @@ -81,15 +90,25 @@ export async function getJobs(ids: string[]): Promise[]> { dbJobMap.set(job.job_id, job); } + for (const job of gcsJobs) { + gcsJobMap.set(job.id, job.job); + } + const jobs: PseudoJob[] = []; for (const id of ids) { const bullJob = bullJobMap.get(id); const dbJob = dbJobMap.get(id); + const gcsJob = gcsJobMap.get(id); if (!bullJob && !dbJob) continue; - const data = dbJob?.docs ?? bullJob?.returnvalue; + const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; + if (gcsJob === null && data) { + logger.warn("GCS Job not found", { + jobId: id, + }); + } const job: PseudoJob = { id, diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts new file mode 100644 index 00000000..024cc410 --- /dev/null +++ b/apps/api/src/lib/gcs-jobs.ts @@ -0,0 +1,104 @@ +import { FirecrawlJob } from "../types"; +import { Storage } from "@google-cloud/storage"; +import { logger } from "./logger"; + +const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined; + +export async function saveJobToGCS(job: FirecrawlJob): Promise { + try { + if (!process.env.GCS_BUCKET_NAME) { + return; + } + + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${job.job_id}.json`); + for (let i = 0; i < 3; i++) { + try { + await blob.save(JSON.stringify(job.docs), { + contentType: "application/json", + }); + break; + } catch (error) { + if (i === 2) { + throw error; + } else { + logger.error(`Error saving job to GCS, retrying`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + i, + }); + } + } + } + for (let i = 0; i < 3; i++) { + try { + await blob.setMetadata({ + metadata: { + job_id: job.job_id ?? null, + success: job.success, + message: job.message ?? null, + num_docs: job.num_docs, + time_taken: job.time_taken, + team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_")) ? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: JSON.stringify(job.crawlerOptions), + page_options: JSON.stringify(job.scrapeOptions), + origin: job.origin, + num_tokens: job.num_tokens ?? null, + retry: !!job.retry, + crawl_id: job.crawl_id ?? null, + tokens_billed: job.tokens_billed ?? null, + }, + }); + break; + } catch (error) { + if (i === 2) { + throw error; + } else { + logger.error(`Error saving job metadata to GCS, retrying`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + i, + }); + } + } + } + } catch (error) { + logger.error(`Error saving job to GCS`, { + error, + scrapeId: job.job_id, + jobId: job.job_id, + }); + } +} + +export async function getJobFromGCS(jobId: string): Promise { + try { + if (!process.env.GCS_BUCKET_NAME) { + return null; + } + + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${jobId}.json`); + const [exists] = await blob.exists(); + if (!exists) { + return null; + } + const [content] = await blob.download(); + const x = JSON.parse(content.toString()); + console.log("Downloaded file ", jobId, x); + return x; + } catch (error) { + logger.error(`Error getting job from GCS`, { + error, + jobId, + scrapeId: jobId, + }); + return null; + } +} \ No newline at end of file diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 9caa6716..ae1d66ae 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -4,7 +4,7 @@ import { posthog } from "../posthog"; import "dotenv/config"; import { logger } from "../../lib/logger"; import { configDotenv } from "dotenv"; -import { Storage } from "@google-cloud/storage"; +import { saveJobToGCS } from "../../lib/gcs-jobs"; configDotenv(); function cleanOfNull(x: T): T { @@ -21,45 +21,6 @@ function cleanOfNull(x: T): T { } } - -async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise { - try { - const storage = new Storage({ - credentials: process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined, - }); - const bucket = storage.bucket(bucketName); - const blob = bucket.file(`${job.job_id}.json`); - await blob.save(JSON.stringify(job.docs), { - contentType: "application/json", - }); - await blob.setMetadata({ - metadata: { - job_id: job.job_id ?? null, - success: job.success, - message: job.message ?? null, - num_docs: job.num_docs, - time_taken: job.time_taken, - team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, - mode: job.mode, - url: job.url, - crawler_options: job.crawlerOptions, - page_options: job.scrapeOptions, - origin: job.origin, - num_tokens: job.num_tokens ?? null, - retry: !!job.retry, - crawl_id: job.crawl_id ?? null, - tokens_billed: job.tokens_billed ?? null, - }, - }) - } catch (error) { - logger.error(`Error saving job to GCS`, { - error, - scrapeId: job.job_id, - jobId: job.job_id, - }); - } -} - async function indexJob(job: FirecrawlJob): Promise { try { if (job.mode !== "single_urls" && job.mode !== "scrape") { @@ -148,7 +109,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { } if (process.env.GCS_BUCKET_NAME) { - await saveJobToGCS(job, process.env.GCS_BUCKET_NAME); + await saveJobToGCS(job); } if (force) { From dc1a17d5718d7286e45ed5b6b21663a5f224ea84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 13:04:00 +0200 Subject: [PATCH 057/160] remove bad log --- apps/api/src/lib/gcs-jobs.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts index 024cc410..6895c7e1 100644 --- a/apps/api/src/lib/gcs-jobs.ts +++ b/apps/api/src/lib/gcs-jobs.ts @@ -91,7 +91,6 @@ export async function getJobFromGCS(jobId: string): Promise { } const [content] = await blob.download(); const x = JSON.parse(content.toString()); - console.log("Downloaded file ", jobId, x); return x; } catch (error) { logger.error(`Error getting job from GCS`, { From 9fd735f3a1fbda29ce34117f70bbe68b7537ffb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 15:45:07 +0200 Subject: [PATCH 058/160] feat(api/test/snips): disable flaky tests --- apps/api/src/__tests__/snips/billing.test.ts | 315 ++++++++++--------- apps/api/src/__tests__/snips/scrape.test.ts | 21 +- 2 files changed, 171 insertions(+), 165 deletions(-) diff --git a/apps/api/src/__tests__/snips/billing.test.ts b/apps/api/src/__tests__/snips/billing.test.ts index 8afa5a44..9d183e6d 100644 --- a/apps/api/src/__tests__/snips/billing.test.ts +++ b/apps/api/src/__tests__/snips/billing.test.ts @@ -1,192 +1,197 @@ -import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib"; +// import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib"; -const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms)); -const sleepForBatchBilling = () => sleep(20000); +// const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms)); +// const sleepForBatchBilling = () => sleep(20000); -beforeAll(async () => { - // Wait for previous test runs to stop billing processing - if (!process.env.TEST_SUITE_SELF_HOSTED) { - await sleep(40000); - } -}, 50000); +// beforeAll(async () => { +// // Wait for previous test runs to stop billing processing +// if (!process.env.TEST_SUITE_SELF_HOSTED) { +// await sleep(40000); +// } +// }, 50000); -describe("Billing tests", () => { - if (process.env.TEST_SUITE_SELF_HOSTED) { - it("dummy", () => { - expect(true).toBe(true); - }); - } else { - it("bills scrape correctly", async () => { - const rc1 = (await creditUsage()).remaining_credits; +// describe("Billing tests", () => { +// if (process.env.TEST_SUITE_SELF_HOSTED) { +// it("dummy", () => { +// expect(true).toBe(true); +// }); +// } else { +// it("bills scrape correctly", async () => { +// const rc1 = (await creditUsage()).remaining_credits; - // Run all scrape operations in parallel with Promise.all - await Promise.all([ - // scrape 1: regular fc.dev scrape (1 credit) - scrape({ - url: "https://firecrawl.dev" - }), +// // Run all scrape operations in parallel with Promise.all +// await Promise.all([ +// // scrape 1: regular fc.dev scrape (1 credit) +// scrape({ +// url: "https://firecrawl.dev" +// }), - // scrape 1.1: regular fc.dev scrape (1 credit) - scrape({ - url: "https://firecrawl.dev" - }), +// // scrape 1.1: regular fc.dev scrape (1 credit) +// scrape({ +// url: "https://firecrawl.dev" +// }), - // scrape 2: fc.dev with json (5 credits) - scrape({ - url: "https://firecrawl.dev", - formats: ["json"], - jsonOptions: { - schema: { - type: "object", - properties: { - is_open_source: { type: "boolean" }, - }, - required: ["is_open_source"], - }, - }, - }) - ]); +// // scrape 2: fc.dev with json (5 credits) +// scrape({ +// url: "https://firecrawl.dev", +// formats: ["json"], +// jsonOptions: { +// schema: { +// type: "object", +// properties: { +// is_open_source: { type: "boolean" }, +// }, +// required: ["is_open_source"], +// }, +// }, +// }) +// ]); - // sum: 7 credits +// // sum: 7 credits - await sleepForBatchBilling(); +// await sleepForBatchBilling(); - const rc2 = (await creditUsage()).remaining_credits; +// const rc2 = (await creditUsage()).remaining_credits; - expect(rc1 - rc2).toBe(7); - }, 120000); +// expect(rc1 - rc2).toBe(7); +// }, 120000); - it("bills batch scrape correctly", async () => { - const rc1 = (await creditUsage()).remaining_credits; +// it("bills batch scrape correctly", async () => { +// const rc1 = (await creditUsage()).remaining_credits; - // Run both scrape operations in parallel with Promise.all - const [scrape1, scrape2] = await Promise.all([ - // scrape 1: regular batch scrape with failing domain (2 credits) - batchScrape({ - urls: [ - "https://firecrawl.dev", - "https://mendable.ai", - "https://thisdomaindoesnotexistandwillfail.fcr", - ], - }), +// // Run both scrape operations in parallel with Promise.all +// const [scrape1, scrape2] = await Promise.all([ +// // scrape 1: regular batch scrape with failing domain (2 credits) +// batchScrape({ +// urls: [ +// "https://firecrawl.dev", +// "https://mendable.ai", +// "https://thisdomaindoesnotexistandwillfail.fcr", +// ], +// }), - // scrape 2: batch scrape with json (10 credits) - batchScrape({ - urls: [ - "https://firecrawl.dev", - "https://mendable.ai", - "https://thisdomaindoesnotexistandwillfail.fcr", - ], - formats: ["json"], - jsonOptions: { - schema: { - type: "object", - properties: { - four_word_summary: { type: "string" }, - }, - required: ["four_word_summary"], - }, - }, - }) - ]); +// // scrape 2: batch scrape with json (10 credits) +// batchScrape({ +// urls: [ +// "https://firecrawl.dev", +// "https://mendable.ai", +// "https://thisdomaindoesnotexistandwillfail.fcr", +// ], +// formats: ["json"], +// jsonOptions: { +// schema: { +// type: "object", +// properties: { +// four_word_summary: { type: "string" }, +// }, +// required: ["four_word_summary"], +// }, +// }, +// }) +// ]); - // sum: 12 credits +// // sum: 12 credits - await sleepForBatchBilling(); +// await sleepForBatchBilling(); - const rc2 = (await creditUsage()).remaining_credits; +// const rc2 = (await creditUsage()).remaining_credits; - expect(rc1 - rc2).toBe(12); - }, 600000); +// expect(rc1 - rc2).toBe(12); +// }, 600000); - it("bills crawl correctly", async () => { - const rc1 = (await creditUsage()).remaining_credits; +// it("bills crawl correctly", async () => { +// const rc1 = (await creditUsage()).remaining_credits; - // Run both crawl operations in parallel with Promise.all - const [crawl1, crawl2] = await Promise.all([ - // crawl 1: regular fc.dev crawl (x credits) - crawl({ - url: "https://firecrawl.dev", - }), +// // Run both crawl operations in parallel with Promise.all +// const [crawl1, crawl2] = await Promise.all([ +// // crawl 1: regular fc.dev crawl (x credits) +// crawl({ +// url: "https://firecrawl.dev", +// }), - // crawl 2: fc.dev crawl with json (5y credits) - crawl({ - url: "https://firecrawl.dev", - scrapeOptions: { - formats: ["json"], - jsonOptions: { - schema: { - type: "object", - properties: { - four_word_summary: { type: "string" }, - }, - required: ["four_word_summary"], - }, - }, - } - }) - ]); +// // crawl 2: fc.dev crawl with json (5y credits) +// crawl({ +// url: "https://firecrawl.dev", +// scrapeOptions: { +// formats: ["json"], +// jsonOptions: { +// schema: { +// type: "object", +// properties: { +// four_word_summary: { type: "string" }, +// }, +// required: ["four_word_summary"], +// }, +// }, +// } +// }) +// ]); - expect(crawl1.success).toBe(true); - expect(crawl2.success).toBe(true); +// expect(crawl1.success).toBe(true); +// expect(crawl2.success).toBe(true); - // sum: x+5y credits +// // sum: x+5y credits - await sleepForBatchBilling(); +// await sleepForBatchBilling(); - const rc2 = (await creditUsage()).remaining_credits; +// const rc2 = (await creditUsage()).remaining_credits; - if (crawl1.success && crawl2.success) { - expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5); - } - }, 600000); +// if (crawl1.success && crawl2.success) { +// expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5); +// } +// }, 600000); - it("bills map correctly", async () => { - const rc1 = (await creditUsage()).remaining_credits; - await map({ url: "https://firecrawl.dev" }); - await sleepForBatchBilling(); - const rc2 = (await creditUsage()).remaining_credits; - expect(rc1 - rc2).toBe(1); - }, 60000); +// it("bills map correctly", async () => { +// const rc1 = (await creditUsage()).remaining_credits; +// await map({ url: "https://firecrawl.dev" }); +// await sleepForBatchBilling(); +// const rc2 = (await creditUsage()).remaining_credits; +// expect(rc1 - rc2).toBe(1); +// }, 60000); - it("bills search correctly", async () => { - const rc1 = (await creditUsage()).remaining_credits; +// it("bills search correctly", async () => { +// const rc1 = (await creditUsage()).remaining_credits; - const results = await search({ - query: "firecrawl" - }); +// const results = await search({ +// query: "firecrawl" +// }); - await sleepForBatchBilling(); +// await sleepForBatchBilling(); - const rc2 = (await creditUsage()).remaining_credits; +// const rc2 = (await creditUsage()).remaining_credits; - expect(rc1 - rc2).toBe(results.length); - }, 60000); +// expect(rc1 - rc2).toBe(results.length); +// }, 60000); - it("bills extract correctly", async () => { - const rc1 = (await tokenUsage()).remaining_tokens; +// it("bills extract correctly", async () => { +// const rc1 = (await tokenUsage()).remaining_tokens; - await extract({ - urls: ["https://firecrawl.dev"], - schema: { - "type": "object", - "properties": { - "is_open_source": { - "type": "boolean" - } - }, - "required": [ - "is_open_source" - ] - }, - origin: "api-sdk", - }); +// await extract({ +// urls: ["https://firecrawl.dev"], +// schema: { +// "type": "object", +// "properties": { +// "is_open_source": { +// "type": "boolean" +// } +// }, +// "required": [ +// "is_open_source" +// ] +// }, +// origin: "api-sdk", +// }); - await sleepForBatchBilling(); +// await sleepForBatchBilling(); - const rc2 = (await tokenUsage()).remaining_tokens; +// const rc2 = (await tokenUsage()).remaining_tokens; - expect(rc1 - rc2).toBe(305); - }, 300000); - } +// expect(rc1 - rc2).toBe(305); +// }, 300000); +// } +// }); + +// temporarily disabled +it("is mocked", () => { + expect(true).toBe(true); }); \ No newline at end of file diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index bb053271..96337171 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -152,20 +152,21 @@ describe("Scrape tests", () => { await scrape({ url: "http://firecrawl.dev", proxy: "stealth", - timeout: 60000, + timeout: 120000, }); - }, 70000); + }, 130000); }); - describe("PDF (f-e dependant)", () => { - it.concurrent("works for PDFs behind anti-bot", async () => { - const response = await scrape({ - url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf" - }); + // Temporarily disabled, too flaky + // describe("PDF (f-e dependant)", () => { + // it.concurrent("works for PDFs behind anti-bot", async () => { + // const response = await scrape({ + // url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf" + // }); - expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix"); - }, 60000); - }); + // expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix"); + // }, 60000); + // }); } if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) { From da2f17c7579e823b7ce8b3a261af3637675e9e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Wed, 9 Apr 2025 17:24:29 +0100 Subject: [PATCH 059/160] feat(api/search): add search endpoint and update request limits - Introduced a new POST endpoint for searching with a query and limit - Updated the maximum limit for search results from 20 to 50 in the request schema - Adjusted the default number of results in the Google search function from 7 to 5 --- apps/api/requests.http | 11 +++++++++++ apps/api/src/controllers/v1/types.ts | 2 +- apps/api/src/search/googlesearch.ts | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 4c69d011..2ef8bb37 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -120,3 +120,14 @@ content-type: application/json GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} + +### Search +# @name search +POST {{baseUrl}}/v1/search HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "query": "firecrawl", + "limit": 50 +} diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ea46768f..a85ef2cd 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -969,7 +969,7 @@ export const searchRequestSchema = z .positive() .finite() .safe() - .max(20) + .max(50) .optional() .default(5), tbs: z.string().optional(), diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 8e6eade0..f4f924ea 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -70,7 +70,7 @@ async function _req( export async function googleSearch( term: string, advanced = false, - num_results = 7, + num_results = 5, tbs = undefined as string | undefined, filter = undefined as string | undefined, lang = "en", From d3da790dc48ae159c46cc2815af2d047ddac05b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 18:47:45 +0200 Subject: [PATCH 060/160] feat(extraction-service): teamId logging --- apps/api/src/lib/extract/extraction-service.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index acd9f8b0..f259ffd6 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -91,6 +91,7 @@ export async function performExtraction( module: "extract", method: "performExtraction", extractId, + teamId, }); // If no URLs are provided, generate URLs from the prompt From 78a920af61a6f43d9fd799b10accc767d7b9513d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 19:47:38 +0200 Subject: [PATCH 061/160] fix(api/tests/scrape): bump timeout --- apps/api/src/__tests__/snips/scrape.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 96337171..097f5504 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -94,7 +94,7 @@ describe("Scrape tests", () => { expect(response.changeTracking).toBeDefined(); expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); - }); + }, 30000); }); describe("Location API (f-e dependant)", () => { From d925bf2f6844c6753ad7b22cd7decf87fbaf7de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Apr 2025 21:29:56 +0200 Subject: [PATCH 062/160] feat(log_job): stop putting docs in the db (#1438) * feat(log_job): stop putting jobs in the db * fix parens --- apps/api/src/services/logging/log_job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index ae1d66ae..d974df78 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -89,7 +89,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { success: job.success, message: job.message, num_docs: job.num_docs, - docs: cleanOfNull(job.docs), + docs: (job.mode === "single_urls" || job.mode === "scrape") ? null : cleanOfNull(job.docs), time_taken: job.time_taken, team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, mode: job.mode, From 7bb564302891c57b4436e03ab5df7c8dd8120f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 08:48:33 +0200 Subject: [PATCH 063/160] feat(log_job): is_migrated: true --- apps/api/src/services/logging/log_job.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index d974df78..25cdf2d4 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -101,6 +101,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { retry: !!job.retry, crawl_id: job.crawl_id, tokens_billed: job.tokens_billed, + is_migrated: true, }; // Send job to external server From 4294face784e9b2eff46702371baa7ae5ec9fdfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 12:29:54 +0200 Subject: [PATCH 064/160] feat(scrapeURL): reintroduce default timeout for simple queries (#1440) * feat(scrapeURL): reintroduce default timeout * fix * adjust timeouts --- apps/api/src/__tests__/snips/batch-scrape.test.ts | 4 ++-- apps/api/src/scraper/scrapeURL/index.ts | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts index 1890b08b..f3e9e585 100644 --- a/apps/api/src/__tests__/snips/batch-scrape.test.ts +++ b/apps/api/src/__tests__/snips/batch-scrape.test.ts @@ -8,7 +8,7 @@ describe("Batch scrape tests", () => { expect(response.body.data[0]).toHaveProperty("markdown"); expect(response.body.data[0].markdown).toContain("Firecrawl"); - }, 30000); + }, 180000); if (!process.env.TEST_SUITE_SELF_HOSTED) { describe("JSON format", () => { @@ -45,7 +45,7 @@ describe("Batch scrape tests", () => { expect(response.body.data[0].json).toHaveProperty("is_open_source"); expect(response.body.data[0].json.is_open_source).toBe(true); expect(typeof response.body.data[0].json.is_open_source).toBe("boolean"); - }, 30000); + }, 180000); }); } }); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index e047d1cb..cedd275e 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -230,7 +230,9 @@ async function scrapeURLLoop(meta: Meta): Promise { const timeToRun = meta.options.timeout !== undefined ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) - : undefined; + : (!meta.options.actions && !meta.options.jsonOptions && !meta.options.extract) + ? Math.round(120000 / Math.min(fallbackList.length, 2)) + : undefined; for (const { engine, unsupportedFeatures } of fallbackList) { meta.internalOptions.abort?.throwIfAborted(); From 415603acb0587e98aa66addb3ff9eb89827ef1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 12:50:03 +0200 Subject: [PATCH 065/160] fixes --- .../src/scraper/scrapeURL/transformers/diff.ts | 15 +++++++++++---- apps/api/src/services/logging/log_job.ts | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index edc6efd3..ea98fa6a 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -1,22 +1,29 @@ import { supabase_service } from "../../../services/supabase"; import { Document } from "../../../controllers/v1/types"; import { Meta } from "../index"; +import { getJob } from "../../../controllers/v1/crawl-status"; export async function deriveDiff(meta: Meta, document: Document): Promise { if (meta.options.formats.includes("changeTracking")) { const res = await supabase_service - .rpc("diff_get_last_scrape_1", { + .rpc("diff_get_last_scrape_2", { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url, }); const data: { - o_docs: Document[], + o_job_id: string, o_date_added: string, } | undefined | null = (res.data ?? [])[0] as any; - if (data && data.o_docs.length > 0) { - const previousMarkdown = data.o_docs[0].markdown!; + const job: { + returnvalue: Document, + } | null = data?.o_job_id ? await getJob(data.o_job_id) : null; + + console.log(data, job); + + if (data && job && job?.returnvalue) { + const previousMarkdown = job.returnvalue.markdown!; const currentMarkdown = document.markdown!; const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 25cdf2d4..ce6a4887 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -89,7 +89,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { success: job.success, message: job.message, num_docs: job.num_docs, - docs: (job.mode === "single_urls" || job.mode === "scrape") ? null : cleanOfNull(job.docs), + docs: ((job.mode === "single_urls" || job.mode === "scrape") && process.env.GCS_BUCKET_NAME) ? null : cleanOfNull(job.docs), time_taken: job.time_taken, team_id: (job.team_id === "preview" || job.team_id?.startsWith("preview_"))? null : job.team_id, mode: job.mode, From a461f72d17191bc6334a7c86f4cc1dd4c19aec83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 13:07:09 +0200 Subject: [PATCH 066/160] temporarily disable some flaky tests --- apps/api/src/__tests__/snips/crawl.test.ts | 62 +++++++++++----------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts index 599e34cf..2aed9e95 100644 --- a/apps/api/src/__tests__/snips/crawl.test.ts +++ b/apps/api/src/__tests__/snips/crawl.test.ts @@ -37,37 +37,39 @@ describe("Crawl tests", () => { } }, 120000); - it.concurrent("discovers URLs properly when origin is not included", async () => { - const res = await crawl({ - url: "https://firecrawl.dev", - includePaths: ["^/blog"], - ignoreSitemap: true, - limit: 10, - }); + // TEMP: Flaky + // it.concurrent("discovers URLs properly when origin is not included", async () => { + // const res = await crawl({ + // url: "https://firecrawl.dev", + // includePaths: ["^/blog"], + // ignoreSitemap: true, + // limit: 10, + // }); - expect(res.success).toBe(true); - if (res.success) { - expect(res.data.length).toBeGreaterThan(1); - for (const page of res.data) { - expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/); - } - } - }, 120000); + // expect(res.success).toBe(true); + // if (res.success) { + // expect(res.data.length).toBeGreaterThan(1); + // for (const page of res.data) { + // expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/); + // } + // } + // }, 120000); - it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => { - const res = await crawl({ - url: "https://firecrawl.dev", - ignoreSitemap: true, - maxDiscoveryDepth: 1, - limit: 10, - }); + // TEMP: Flaky + // it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => { + // const res = await crawl({ + // url: "https://firecrawl.dev", + // ignoreSitemap: true, + // maxDiscoveryDepth: 1, + // limit: 10, + // }); - expect(res.success).toBe(true); - if (res.success) { - expect(res.data.length).toBeGreaterThan(1); - for (const page of res.data) { - expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/); - } - } - }, 120000); + // expect(res.success).toBe(true); + // if (res.success) { + // expect(res.data.length).toBeGreaterThan(1); + // for (const page of res.data) { + // expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/); + // } + // } + // }, 120000); }); From f2865f66992644ca28cddf06d04d7668800f2fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 16:08:20 +0200 Subject: [PATCH 067/160] temp: disable acuc caching --- apps/api/src/controllers/auth.ts | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 7b38181d..6cf39bf0 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -80,14 +80,14 @@ export async function getACUC( ): Promise { const cacheKeyACUC = `acuc_${api_key}_${mode}`; - if (useCache) { - const cachedACUC = await getValue(cacheKeyACUC); - if (cachedACUC !== null) { - return JSON.parse(cachedACUC); - } - } + // if (useCache) { + // const cachedACUC = await getValue(cacheKeyACUC); + // if (cachedACUC !== null) { + // return JSON.parse(cachedACUC); + // } + // } - if (!cacheOnly) { + // if (!cacheOnly) { let data; let error; let retries = 0; @@ -129,14 +129,14 @@ export async function getACUC( data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery - if (chunk !== null && useCache) { - setCachedACUC(api_key, chunk); - } + // if (chunk !== null && useCache) { + // setCachedACUC(api_key, chunk); + // } return chunk ? { ...chunk, is_extract: isExtract } : null; - } else { - return null; - } + // } else { + // return null; + // } } export async function clearACUC(api_key: string): Promise { From 6a10f0689de8566d01c4466fd5813a259be065bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 18:49:23 +0200 Subject: [PATCH 068/160] ACUC: Dynamic Limits (FIR-1641) (#1434) * extend acuc definition * kill plan * stuff * stupid tests * feat: better acuc * feat(acuc): mock ACUC when not using db auth --- .../src/__tests__/concurrency-limit.test.ts | 30 - .../unit/deep-research-redis.test.ts | 1 - .../queue-concurrency-integration.test.ts | 20 +- apps/api/src/__tests__/snips/crawl.test.ts | 4 +- apps/api/src/controllers/auth.ts | 308 +++++---- .../controllers/v0/admin/acuc-cache-clear.ts | 3 +- apps/api/src/controllers/v0/crawl.ts | 8 +- apps/api/src/controllers/v0/crawlPreview.ts | 5 - apps/api/src/controllers/v0/scrape.ts | 9 +- apps/api/src/controllers/v0/search.ts | 8 +- apps/api/src/controllers/v1/batch-scrape.ts | 4 - .../src/controllers/v1/concurrency-check.ts | 10 +- .../api/src/controllers/v1/crawl-status-ws.ts | 4 +- apps/api/src/controllers/v1/crawl.ts | 3 - apps/api/src/controllers/v1/credit-usage.ts | 4 +- apps/api/src/controllers/v1/deep-research.ts | 2 - apps/api/src/controllers/v1/extract.ts | 3 - .../src/controllers/v1/generate-llmstxt.ts | 2 - apps/api/src/controllers/v1/map.ts | 4 - apps/api/src/controllers/v1/scrape-status.ts | 1 - apps/api/src/controllers/v1/scrape.ts | 3 - apps/api/src/controllers/v1/search.ts | 7 +- apps/api/src/controllers/v1/token-usage.ts | 4 +- apps/api/src/controllers/v1/types.ts | 24 +- .../src/lib/__tests__/job-priority.test.ts | 18 +- apps/api/src/lib/crawl-redis.ts | 4 - .../lib/deep-research/deep-research-redis.ts | 1 - .../deep-research/deep-research-service.ts | 6 +- .../src/lib/deep-research/research-manager.ts | 1 - apps/api/src/lib/extract/document-scraper.ts | 4 - apps/api/src/lib/extract/extract-redis.ts | 1 - .../api/src/lib/extract/extraction-service.ts | 7 +- apps/api/src/lib/extract/url-processor.ts | 4 - .../generate-llmstxt-redis.ts | 1 - .../generate-llmstxt-service.ts | 6 +- apps/api/src/lib/job-priority.ts | 52 +- apps/api/src/routes/v1.ts | 4 +- .../api/src/services/billing/batch_billing.ts | 14 +- apps/api/src/services/queue-jobs.ts | 13 +- apps/api/src/services/queue-worker.ts | 14 +- apps/api/src/services/rate-limiter.test.ts | 642 +++++++++--------- apps/api/src/services/rate-limiter.ts | 307 +-------- apps/api/src/types.ts | 26 - 43 files changed, 611 insertions(+), 985 deletions(-) diff --git a/apps/api/src/__tests__/concurrency-limit.test.ts b/apps/api/src/__tests__/concurrency-limit.test.ts index 2cd9a63f..742bffd6 100644 --- a/apps/api/src/__tests__/concurrency-limit.test.ts +++ b/apps/api/src/__tests__/concurrency-limit.test.ts @@ -9,8 +9,6 @@ import { getConcurrencyQueueJobsCount, ConcurrencyLimitedJob, } from "../lib/concurrency-limit"; -import { CONCURRENCY_LIMIT, getConcurrencyLimitMax } from "../services/rate-limiter"; -import { PlanType } from "../types"; // Mock Redis client jest.mock("../services/queue-service", () => ({ @@ -174,34 +172,6 @@ describe("Concurrency Limit", () => { }); }); - describe("getConcurrencyLimitMax", () => { - it("should return correct limit for free plan", () => { - const result = getConcurrencyLimitMax("free"); - expect(result).toBe(2); - }); - - it("should return correct limit for standard plan", () => { - const result = getConcurrencyLimitMax("standard"); - expect(result).toBe(CONCURRENCY_LIMIT.standard); - }); - - it("should return correct limit for scale plan", () => { - const result = getConcurrencyLimitMax("scale"); - expect(result).toBe(CONCURRENCY_LIMIT.scale); - }); - - it("should return default limit for unknown plan", () => { - const result = getConcurrencyLimitMax("unknown" as PlanType); - expect(result).toBe(10); - }); - - it("should handle special team IDs", () => { - process.env.DEV_B_TEAM_ID = "dev-b-team"; - const result = getConcurrencyLimitMax("free", "dev-b-team"); - expect(result).toBe(120); - }); - }); - describe("Integration Scenarios", () => { it("should handle complete job lifecycle", async () => { const mockJob: ConcurrencyLimitedJob = { diff --git a/apps/api/src/__tests__/deep-research/unit/deep-research-redis.test.ts b/apps/api/src/__tests__/deep-research/unit/deep-research-redis.test.ts index 13b4d994..2f2ff735 100644 --- a/apps/api/src/__tests__/deep-research/unit/deep-research-redis.test.ts +++ b/apps/api/src/__tests__/deep-research/unit/deep-research-redis.test.ts @@ -20,7 +20,6 @@ describe("Deep Research Redis Operations", () => { const mockResearch: StoredDeepResearch = { id: "test-id", team_id: "team-1", - plan: "pro", createdAt: Date.now(), status: "processing", currentDepth: 0, diff --git a/apps/api/src/__tests__/queue-concurrency-integration.test.ts b/apps/api/src/__tests__/queue-concurrency-integration.test.ts index 004ea592..940efaaa 100644 --- a/apps/api/src/__tests__/queue-concurrency-integration.test.ts +++ b/apps/api/src/__tests__/queue-concurrency-integration.test.ts @@ -6,8 +6,8 @@ import { takeConcurrencyLimitedJob, removeConcurrencyLimitActiveJob, } from "../lib/concurrency-limit"; -import { getConcurrencyLimitMax } from "../services/rate-limiter"; -import { WebScraperOptions, PlanType } from "../types"; +import { WebScraperOptions } from "../types"; +import { getACUCTeam } from "../controllers/auth"; // Mock all the dependencies const mockAdd = jest.fn(); @@ -32,7 +32,6 @@ jest.mock("uuid", () => ({ describe("Queue Concurrency Integration", () => { const mockTeamId = "test-team-id"; - const mockPlan = "standard" as PlanType; const mockNow = Date.now(); const defaultScrapeOptions = { @@ -77,7 +76,6 @@ describe("Queue Concurrency Integration", () => { url: "https://test.com", mode: "single_urls", team_id: mockTeamId, - plan: mockPlan, scrapeOptions: defaultScrapeOptions, crawlerOptions: null, }; @@ -104,8 +102,10 @@ describe("Queue Concurrency Integration", () => { it("should add job to concurrency queue when at concurrency limit", async () => { // Mock current active jobs to be at limit - const maxConcurrency = getConcurrencyLimitMax(mockPlan); - const activeJobs = Array(maxConcurrency).fill("active-job"); + (getACUCTeam as jest.Mock).mockResolvedValue({ + concurrency: 15, + } as any); + const activeJobs = Array(15).fill("active-job"); (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue( activeJobs, ); @@ -136,7 +136,6 @@ describe("Queue Concurrency Integration", () => { url: `https://test${i}.com`, mode: "single_urls", team_id: mockTeamId, - plan: mockPlan, scrapeOptions: defaultScrapeOptions, } as WebScraperOptions, opts: { @@ -146,7 +145,10 @@ describe("Queue Concurrency Integration", () => { })); it("should handle batch jobs respecting concurrency limits", async () => { - const maxConcurrency = getConcurrencyLimitMax(mockPlan); + const maxConcurrency = 15; + (getACUCTeam as jest.Mock).mockResolvedValue({ + concurrency: maxConcurrency, + } as any); const totalJobs = maxConcurrency + 5; // Some jobs should go to queue const mockJobs = createMockJobs(totalJobs); @@ -180,7 +182,6 @@ describe("Queue Concurrency Integration", () => { id: "test-job", data: { team_id: mockTeamId, - plan: mockPlan, }, }; @@ -218,7 +219,6 @@ describe("Queue Concurrency Integration", () => { id: "failing-job", data: { team_id: mockTeamId, - plan: mockPlan, }, }; diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts index 2aed9e95..d04eab35 100644 --- a/apps/api/src/__tests__/snips/crawl.test.ts +++ b/apps/api/src/__tests__/snips/crawl.test.ts @@ -53,7 +53,7 @@ describe("Crawl tests", () => { // expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/); // } // } - // }, 120000); + // }, 300000); // TEMP: Flaky // it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => { @@ -71,5 +71,5 @@ describe("Crawl tests", () => { // expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/); // } // } - // }, 120000); + // }, 300000); }); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 6cf39bf0..395331af 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -3,7 +3,6 @@ import { getRateLimiter, isTestSuiteToken } from "../services/rate-limiter"; import { AuthResponse, NotificationType, - PlanType, RateLimiterMode, } from "../types"; import { supabase_rr_service, supabase_service } from "../services/supabase"; @@ -16,7 +15,7 @@ import { deleteKey, getValue } from "../services/redis"; import { setValue } from "../services/redis"; import { validate } from "uuid"; import * as Sentry from "@sentry/node"; -import { AuthCreditUsageChunk } from "./v1/types"; +import { AuthCreditUsageChunk, AuthCreditUsageChunkFromTeam } from "./v1/types"; // const { data, error } = await supabase_service // .from('api_keys') // .select(` @@ -38,12 +37,13 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { export async function setCachedACUC( api_key: string, + is_extract: boolean, acuc: | AuthCreditUsageChunk | null | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null), ) { - const cacheKeyACUC = `acuc_${api_key}`; + const cacheKeyACUC = `acuc_${api_key}_${is_extract ? "extract" : "scrape"}`; const redLockKey = `lock_${cacheKeyACUC}`; try { @@ -72,13 +72,55 @@ export async function setCachedACUC( } } +const mockACUC: () => AuthCreditUsageChunk = () => ({ + api_key: "bypass", + team_id: "bypass", + sub_id: "bypass", + sub_current_period_start: new Date().toISOString(), + sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(), + sub_user_id: "bypass", + price_id: "bypass", + rate_limits: { + crawl: 99999999, + scrape: 99999999, + extract: 99999999, + search: 99999999, + map: 99999999, + preview: 99999999, + crawlStatus: 99999999, + extractStatus: 99999999, + }, + price_credits: 99999999, + credits_used: 0, + coupon_credits: 99999999, + adjusted_credits_used: 0, + remaining_credits: 99999999, + total_credits_sum: 99999999, + plan_priority: { + bucketLimit: 25, + planModifier: 0.1, + }, + concurrency: 99999999, + is_extract: false, +}); + export async function getACUC( api_key: string, cacheOnly = false, useCache = true, mode?: RateLimiterMode, ): Promise { - const cacheKeyACUC = `acuc_${api_key}_${mode}`; + let isExtract = + mode === RateLimiterMode.Extract || + mode === RateLimiterMode.ExtractStatus; + + if (process.env.USE_DB_AUTHENTICATION !== "true") { + const acuc = mockACUC(); + acuc.is_extract = isExtract; + return acuc; + } + + const cacheKeyACUC = `acuc_${api_key}_${isExtract ? "extract" : "scrape"}`; // if (useCache) { // const cachedACUC = await getValue(cacheKeyACUC); @@ -92,15 +134,11 @@ export async function getACUC( let error; let retries = 0; const maxRetries = 5; - - let isExtract = - mode === RateLimiterMode.Extract || - mode === RateLimiterMode.ExtractStatus; while (retries < maxRetries) { const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_27_tally", + "auth_credit_usage_chunk_28", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); @@ -125,6 +163,117 @@ export async function getACUC( await new Promise((resolve) => setTimeout(resolve, 200)); } + const chunk: AuthCreditUsageChunk | null = + data.length === 0 ? null : data[0].team_id === null ? null : data[0]; + + // NOTE: Should we cache null chunks? - mogery + // if (chunk !== null && useCache) { + // setCachedACUC(api_key, isExtract, chunk); + // } + + return chunk ? { ...chunk, is_extract: isExtract } : null; + // } else { + // return null; + // } +} + +export async function setCachedACUCTeam( + team_id: string, + is_extract: boolean, + acuc: + | AuthCreditUsageChunkFromTeam + | null + | ((acuc: AuthCreditUsageChunkFromTeam) => AuthCreditUsageChunkFromTeam | null), +) { + const cacheKeyACUC = `acuc_team_${team_id}_${is_extract ? "extract" : "scrape"}`; + const redLockKey = `lock_${cacheKeyACUC}`; + + try { + await redlock.using([redLockKey], 10000, {}, async (signal) => { + if (typeof acuc === "function") { + acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null")); + + if (acuc === null) { + if (signal.aborted) { + throw signal.error; + } + + return; + } + } + + if (signal.aborted) { + throw signal.error; + } + + // Cache for 1 hour. - mogery + await setValue(cacheKeyACUC, JSON.stringify(acuc), 3600, true); + }); + } catch (error) { + logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); + } +} + +export async function getACUCTeam( + team_id: string, + cacheOnly = false, + useCache = true, + mode?: RateLimiterMode, +): Promise { + let isExtract = + mode === RateLimiterMode.Extract || + mode === RateLimiterMode.ExtractStatus; + + if (process.env.USE_DB_AUTHENTICATION !== "true") { + const acuc = mockACUC(); + acuc.is_extract = isExtract; + return acuc; + } + + const cacheKeyACUC = `acuc_team_${team_id}_${isExtract ? "extract" : "scrape"}`; + + // if (useCache) { + // const cachedACUC = await getValue(cacheKeyACUC); + // if (cachedACUC !== null) { + // return JSON.parse(cachedACUC); + // } + // } + + // if (!cacheOnly) { + let data; + let error; + let retries = 0; + const maxRetries = 5; + + while (retries < maxRetries) { + const client = + Math.random() > (2/3) ? supabase_rr_service : supabase_service; + ({ data, error } = await client.rpc( + "auth_credit_usage_chunk_28_from_team", + { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, + { get: true }, + )); + + if (!error) { + break; + } + + logger.warn( + `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`, + { error } + ); + retries++; + if (retries === maxRetries) { + throw new Error( + "Failed to retrieve authentication and credit usage data after 3 attempts: " + + JSON.stringify(error), + ); + } + + // Wait for a short time before retrying + await new Promise((resolve) => setTimeout(resolve, 200)); + } + const chunk: AuthCreditUsageChunk | null = data.length === 0 ? null : data[0].team_id === null ? null : data[0]; @@ -141,10 +290,10 @@ export async function getACUC( export async function clearACUC(api_key: string): Promise { // Delete cache for all rate limiter modes - const modes = Object.values(RateLimiterMode); + const modes = [true, false]; await Promise.all( modes.map(async (mode) => { - const cacheKey = `acuc_${api_key}_${mode}`; + const cacheKey = `acuc_${api_key}_${mode ? "extract" : "scrape"}`; await deleteKey(cacheKey); }), ); @@ -153,6 +302,20 @@ export async function clearACUC(api_key: string): Promise { await deleteKey(`acuc_${api_key}`); } +export async function clearACUCTeam(team_id: string): Promise { + // Delete cache for all rate limiter modes + const modes = [true, false]; + await Promise.all( + modes.map(async (mode) => { + const cacheKey = `acuc_team_${team_id}_${mode ? "extract" : "scrape"}`; + await deleteKey(cacheKey); + }), + ); + + // Also clear the base cache key + await deleteKey(`acuc_team_${team_id}`); +} + export async function authenticateUser( req, res, @@ -192,13 +355,12 @@ export async function supaAuthenticateUser( const iptoken = incomingIP + token; let rateLimiter: RateLimiterRedis; - let subscriptionData: { team_id: string; plan: string } | null = null; + let subscriptionData: { team_id: string} | null = null; let normalizedApi: string; let teamId: string | null = null; let priceId: string | null = null; let chunk: AuthCreditUsageChunk | null = null; - let plan: PlanType = "free"; if (token == "this_is_just_a_preview_token") { throw new Error( "Unauthenticated Playground calls are temporarily disabled due to abuse. Please sign up.", @@ -213,7 +375,6 @@ export async function supaAuthenticateUser( rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); } teamId = `preview_${iptoken}`; - plan = "free"; } else { normalizedApi = parseApi(token); if (!normalizedApiIsUuid(normalizedApi)) { @@ -237,65 +398,13 @@ export async function supaAuthenticateUser( teamId = chunk.team_id; priceId = chunk.price_id; - plan = getPlanByPriceId(priceId); subscriptionData = { team_id: teamId, - plan, }; - switch (mode) { - case RateLimiterMode.Crawl: - rateLimiter = getRateLimiter( - RateLimiterMode.Crawl, - token, - subscriptionData.plan, - ); - break; - case RateLimiterMode.Scrape: - rateLimiter = getRateLimiter( - RateLimiterMode.Scrape, - token, - subscriptionData.plan, - teamId, - ); - break; - case RateLimiterMode.Search: - rateLimiter = getRateLimiter( - RateLimiterMode.Search, - token, - subscriptionData.plan, - ); - break; - case RateLimiterMode.Map: - rateLimiter = getRateLimiter( - RateLimiterMode.Map, - token, - subscriptionData.plan, - ); - break; - case RateLimiterMode.Extract: - rateLimiter = getRateLimiter( - RateLimiterMode.Extract, - token, - subscriptionData.plan, - ); - break; - case RateLimiterMode.ExtractStatus: - rateLimiter = getRateLimiter(RateLimiterMode.ExtractStatus, token); - break; - case RateLimiterMode.CrawlStatus: - rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); - break; - - case RateLimiterMode.Preview: - rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); - break; - default: - rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token); - break; - // case RateLimiterMode.Search: - // rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token); - // break; - } + rateLimiter = getRateLimiter( + mode ?? RateLimiterMode.Crawl, + chunk.rate_limits, + ); } const team_endpoint_token = @@ -307,8 +416,8 @@ export async function supaAuthenticateUser( logger.error(`Rate limit exceeded: ${rateLimiterRes}`, { teamId, priceId, - plan: subscriptionData?.plan, mode, + rateLimits: chunk?.rate_limits, rateLimiterRes, }); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; @@ -342,7 +451,6 @@ export async function supaAuthenticateUser( success: true, team_id: `preview_${iptoken}`, chunk: null, - plan: "free", }; // check the origin of the request and make sure its from firecrawl.dev // const origin = req.headers.origin; @@ -356,65 +464,9 @@ export async function supaAuthenticateUser( // return { success: false, error: "Unauthorized: Invalid token", status: 401 }; } - if (token && isTestSuiteToken(token)) { - return { - success: true, - team_id: teamId ?? undefined, - // Now we have a test suite plan - plan: "testSuite", - chunk, - }; - } - return { success: true, team_id: teamId ?? undefined, - plan: (subscriptionData?.plan ?? "") as PlanType, chunk, }; } -function getPlanByPriceId(price_id: string | null): PlanType { - switch (price_id) { - case process.env.STRIPE_PRICE_ID_STARTER: - return "starter"; - case process.env.STRIPE_PRICE_ID_STANDARD: - return "standard"; - case process.env.STRIPE_PRICE_ID_SCALE: - return "scale"; - case process.env.STRIPE_PRICE_ID_HOBBY: - case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: - return "hobby"; - case process.env.STRIPE_PRICE_ID_STANDARD_NEW: - case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: - return "standardnew"; - case process.env.STRIPE_PRICE_ID_GROWTH: - case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: - case process.env.STRIPE_PRICE_ID_SCALE_2M: - return "growth"; - case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY: - return "growthdouble"; - case process.env.STRIPE_PRICE_ID_ETIER2C: - return "etier2c"; - case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh - return "etier1a"; - case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY: - case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY: - case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY_FIRECRAWL: - return "etierscale1"; - case process.env.STRIPE_PRICE_ID_ETIER_SCALE_2_YEARLY: - case process.env.STRIPE_PRICE_ID_ETIER_SCALE_2_MONTHLY: - return "etierscale2"; - case process.env.STRIPE_PRICE_ID_EXTRACT_STARTER_MONTHLY: - case process.env.STRIPE_PRICE_ID_EXTRACT_STARTER_YEARLY: - return "extract_starter"; - case process.env.STRIPE_PRICE_ID_EXTRACT_EXPLORER_MONTHLY: - case process.env.STRIPE_PRICE_ID_EXTRACT_EXPLORER_YEARLY: - return "extract_explorer"; - case process.env.STRIPE_PRICE_ID_EXTRACT_PRO_MONTHLY: - case process.env.STRIPE_PRICE_ID_EXTRACT_PRO_YEARLY: - return "extract_pro"; - default: - return "free"; - } -} - diff --git a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts index dc554f9b..44459fe6 100644 --- a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts +++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { supabase_service } from "../../../services/supabase"; -import { clearACUC } from "../../auth"; +import { clearACUC, clearACUCTeam } from "../../auth"; import { logger } from "../../../lib/logger"; export async function acucCacheClearController(req: Request, res: Response) { @@ -13,6 +13,7 @@ export async function acucCacheClearController(req: Request, res: Response) { .eq("team_id", team_id); await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key))); + await clearACUCTeam(team_id); logger.info(`ACUC cache cleared for team ${team_id}`); res.json({ ok: true }); diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index c8b186b0..889b5c2b 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -39,7 +39,7 @@ export async function crawlController(req: Request, res: Response) { return res.status(auth.status).json({ error: auth.error }); } - const { team_id, plan, chunk } = auth; + const { team_id, chunk } = auth; redisConnection.sadd("teams_using_v0", team_id) .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id })); @@ -170,7 +170,6 @@ export async function crawlController(req: Request, res: Response) { scrapeOptions, internalOptions, team_id, - plan, createdAt: Date.now(), }; @@ -190,7 +189,6 @@ export async function crawlController(req: Request, res: Response) { if (urls.length === 0) return; let jobPriority = await getJobPriority({ - plan, team_id, basePriority: 21, }); @@ -205,7 +203,6 @@ export async function crawlController(req: Request, res: Response) { scrapeOptions, internalOptions, team_id, - plan, origin: req.body.origin ?? defaultOrigin, crawl_id: id, sitemapped: true, @@ -236,7 +233,7 @@ export async function crawlController(req: Request, res: Response) { await lockURL(id, sc, url); // Not needed, first one should be 15. - // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10}) + // const jobPriority = await getJobPriority({team_id, basePriority: 10}) const jobId = uuidv4(); await addScrapeJob( @@ -247,7 +244,6 @@ export async function crawlController(req: Request, res: Response) { scrapeOptions, internalOptions, team_id, - plan: plan!, origin: req.body.origin ?? defaultOrigin, crawl_id: id, }, diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 9153ea79..abdbdda9 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -31,8 +31,6 @@ export async function crawlPreviewController(req: Request, res: Response) { return res.status(auth.status).json({ error: auth.error }); } - const { plan } = auth; - let url = req.body.url; if (!url) { return res.status(400).json({ error: "Url is required" }); @@ -108,7 +106,6 @@ export async function crawlPreviewController(req: Request, res: Response) { scrapeOptions, internalOptions, team_id, - plan, robots, createdAt: Date.now(), }; @@ -130,7 +127,6 @@ export async function crawlPreviewController(req: Request, res: Response) { url, mode: "single_urls", team_id, - plan: plan!, crawlerOptions, scrapeOptions, internalOptions, @@ -153,7 +149,6 @@ export async function crawlPreviewController(req: Request, res: Response) { url, mode: "single_urls", team_id, - plan: plan!, crawlerOptions, scrapeOptions, internalOptions, diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 0bdd197b..053afd50 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -5,7 +5,7 @@ import { checkTeamCredits, } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; -import { PlanType, RateLimiterMode } from "../../types"; +import { RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { fromLegacyCombo, @@ -39,7 +39,6 @@ export async function scrapeHelper( pageOptions: PageOptions, extractorOptions: ExtractorOptions, timeout: number, - plan?: PlanType, ): Promise<{ success: boolean; error?: string; @@ -59,7 +58,7 @@ export async function scrapeHelper( }; } - const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); + const jobPriority = await getJobPriority({ team_id, basePriority: 10 }); const { scrapeOptions, internalOptions } = fromLegacyCombo( pageOptions, @@ -76,7 +75,6 @@ export async function scrapeHelper( team_id, scrapeOptions, internalOptions, - plan: plan!, origin: req.body.origin ?? defaultOrigin, is_scrape: true, }, @@ -180,7 +178,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(auth.status).json({ error: auth.error }); } - const { team_id, plan, chunk } = auth; + const { team_id, chunk } = auth; redisConnection.sadd("teams_using_v0", team_id) .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id })); @@ -240,7 +238,6 @@ export async function scrapeController(req: Request, res: Response) { pageOptions, extractorOptions, timeout, - plan, ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index d8649a52..45092b77 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -4,7 +4,7 @@ import { checkTeamCredits, } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; -import { PlanType, RateLimiterMode } from "../../types"; +import { RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { PageOptions, SearchOptions } from "../../lib/entities"; import { search } from "../../search"; @@ -31,7 +31,6 @@ export async function searchHelper( crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, - plan: PlanType | undefined, ): Promise<{ success: boolean; error?: string; @@ -94,7 +93,7 @@ export async function searchHelper( return { success: true, error: "No search results found", returnCode: 200 }; } - const jobPriority = await getJobPriority({ plan, team_id, basePriority: 20 }); + const jobPriority = await getJobPriority({ team_id, basePriority: 20 }); // filter out social media links @@ -163,7 +162,7 @@ export async function searchController(req: Request, res: Response) { if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } - const { team_id, plan, chunk } = auth; + const { team_id, chunk } = auth; redisConnection.sadd("teams_using_v0", team_id) .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id })); @@ -202,7 +201,6 @@ export async function searchController(req: Request, res: Response) { crawlerOptions, pageOptions, searchOptions, - plan, ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 20fab47c..b4da87c5 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -40,7 +40,6 @@ export async function batchScrapeController( module: "api/v1", method: "batchScrapeController", teamId: req.auth.team_id, - plan: req.auth.plan, }); let urls = req.body.urls; @@ -85,7 +84,6 @@ export async function batchScrapeController( internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), - plan: req.auth.plan, }; if (!req.body.appendToId) { @@ -99,7 +97,6 @@ export async function batchScrapeController( if (urls.length > 1000) { // set base to 21 jobPriority = await getJobPriority({ - plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21, }); @@ -116,7 +113,6 @@ export async function batchScrapeController( url: x, mode: "single_urls" as const, team_id: req.auth.team_id, - plan: req.auth.plan!, crawlerOptions: null, scrapeOptions, origin: "api", diff --git a/apps/api/src/controllers/v1/concurrency-check.ts b/apps/api/src/controllers/v1/concurrency-check.ts index 757541de..1aa69363 100644 --- a/apps/api/src/controllers/v1/concurrency-check.ts +++ b/apps/api/src/controllers/v1/concurrency-check.ts @@ -1,13 +1,10 @@ -import { authenticateUser } from "../auth"; import { ConcurrencyCheckParams, ConcurrencyCheckResponse, RequestWithAuth, } from "./types"; -import { RateLimiterMode, PlanType } from "../../types"; import { Response } from "express"; import { redisConnection } from "../../services/queue-service"; -import { getConcurrencyLimitMax } from "../../services/rate-limiter"; // Basically just middleware and error wrapping export async function concurrencyCheckController( @@ -22,14 +19,9 @@ export async function concurrencyCheckController( Infinity, ); - const maxConcurrency = getConcurrencyLimitMax( - req.auth.plan as PlanType, - req.auth.team_id, - ); - return res.status(200).json({ success: true, concurrency: activeJobsOfTeam.length, - maxConcurrency: maxConcurrency, + maxConcurrency: req.acuc.concurrency, }); } diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 8a056cc6..3fc74f78 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -191,9 +191,9 @@ export async function crawlStatusWSController( }); } - const { team_id, plan } = auth; + const { team_id } = auth; - req.auth = { team_id, plan }; + req.auth = { team_id }; await crawlStatusWS(ws, req); } catch (err) { diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 31e39502..17089c7b 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -25,7 +25,6 @@ export async function crawlController( module: "api/v1", method: "crawlController", teamId: req.auth.team_id, - plan: req.auth.plan, }); logger.debug("Crawl " + id + " starting", { request: req.body, @@ -84,7 +83,6 @@ export async function crawlController( internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), - plan: req.auth.plan, }; const crawler = crawlToCrawler(id, sc); @@ -104,7 +102,6 @@ export async function crawlController( url: req.body.url, mode: "kickoff" as const, team_id: req.auth.team_id, - plan: req.auth.plan, crawlerOptions, scrapeOptions: sc.scrapeOptions, internalOptions: sc.internalOptions, diff --git a/apps/api/src/controllers/v1/credit-usage.ts b/apps/api/src/controllers/v1/credit-usage.ts index da522c13..fc070b24 100644 --- a/apps/api/src/controllers/v1/credit-usage.ts +++ b/apps/api/src/controllers/v1/credit-usage.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { RequestWithAuth } from "./types"; -import { getACUC } from "../auth"; +import { getACUCTeam } from "../auth"; import { logger } from "../../lib/logger"; export async function creditUsageController( @@ -20,7 +20,7 @@ export async function creditUsageController( } // Otherwise fetch fresh data - const chunk = await getACUC(req.auth.team_id); + const chunk = await getACUCTeam(req.auth.team_id); if (!chunk) { res.status(404).json({ success: false, diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts index 3d52d19d..b9baa7ca 100644 --- a/apps/api/src/controllers/v1/deep-research.ts +++ b/apps/api/src/controllers/v1/deep-research.ts @@ -52,7 +52,6 @@ export async function deepResearchController( const jobData = { request: req.body, teamId: req.auth.team_id, - plan: req.auth.plan, subId: req.acuc?.sub_id, researchId, }; @@ -60,7 +59,6 @@ export async function deepResearchController( await saveDeepResearch(researchId, { id: researchId, team_id: req.auth.team_id, - plan: req.auth.plan, createdAt: Date.now(), status: "processing", currentDepth: 0, diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index b18117f5..d0e008a7 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -22,7 +22,6 @@ export async function oldExtract( const result = await performExtraction(extractId, { request: req.body, teamId: req.auth.team_id, - plan: req.auth.plan ?? "free", subId: req.acuc?.sub_id ?? undefined, }); @@ -52,7 +51,6 @@ export async function extractController( const jobData = { request: req.body, teamId: req.auth.team_id, - plan: req.auth.plan, subId: req.acuc?.sub_id, extractId, }; @@ -68,7 +66,6 @@ export async function extractController( await saveExtract(extractId, { id: extractId, team_id: req.auth.team_id, - plan: req.auth.plan, createdAt: Date.now(), status: "processing", showSteps: req.body.__experimental_streamSteps, diff --git a/apps/api/src/controllers/v1/generate-llmstxt.ts b/apps/api/src/controllers/v1/generate-llmstxt.ts index f4e528ce..52358ba8 100644 --- a/apps/api/src/controllers/v1/generate-llmstxt.ts +++ b/apps/api/src/controllers/v1/generate-llmstxt.ts @@ -30,7 +30,6 @@ export async function generateLLMsTextController( const jobData = { request: req.body, teamId: req.auth.team_id, - plan: req.auth.plan, subId: req.acuc?.sub_id, generationId, }; @@ -38,7 +37,6 @@ export async function generateLLMsTextController( await saveGeneratedLlmsTxt(generationId, { id: generationId, team_id: req.auth.team_id, - plan: req.auth.plan!, // Add non-null assertion since plan is required createdAt: Date.now(), status: "processing", url: req.body.url, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 49890d90..455f1ed9 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -50,7 +50,6 @@ export async function getMapResults({ includeSubdomains = true, crawlerOptions = {}, teamId, - plan, origin, includeMetadata = false, allowExternalLinks, @@ -65,7 +64,6 @@ export async function getMapResults({ includeSubdomains?: boolean; crawlerOptions?: any; teamId: string; - plan?: string; origin?: string; includeMetadata?: boolean; allowExternalLinks?: boolean; @@ -88,7 +86,6 @@ export async function getMapResults({ internalOptions: { teamId }, team_id: teamId, createdAt: Date.now(), - plan: plan, }; const crawler = crawlToCrawler(id, sc); @@ -322,7 +319,6 @@ export async function mapController( crawlerOptions: req.body, origin: req.body.origin, teamId: req.auth.team_id, - plan: req.auth.plan, abort: abort.signal, mock: req.body.useMock, filterByPath: req.body.filterByPath !== false, diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index 7d074d42..1de111a2 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -1,6 +1,5 @@ import { Response } from "express"; import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs"; -import { scrapeStatusRateLimiter } from "../../services/rate-limiter"; export async function scrapeStatusController(req: any, res: any) { const allowedTeams = [ diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 44214ee2..ccf81cda 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -12,7 +12,6 @@ import { v4 as uuidv4 } from "uuid"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { logJob } from "../../services/logging/log_job"; import { getJobPriority } from "../../lib/job-priority"; -import { PlanType } from "../../types"; import { getScrapeQueue } from "../../services/queue-service"; export async function scrapeController( @@ -38,7 +37,6 @@ export async function scrapeController( const startTime = new Date().getTime(); const jobPriority = await getJobPriority({ - plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10, }); @@ -51,7 +49,6 @@ export async function scrapeController( team_id: req.auth.team_id, scrapeOptions: req.body, internalOptions: { teamId: req.auth.team_id }, - plan: req.auth.plan!, origin: req.body.origin, is_scrape: true, }, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 082cd8cd..d48c511f 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -13,7 +13,7 @@ import { v4 as uuidv4 } from "uuid"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { logJob } from "../../services/logging/log_job"; import { getJobPriority } from "../../lib/job-priority"; -import { PlanType, Mode } from "../../types"; +import { Mode } from "../../types"; import { getScrapeQueue } from "../../services/queue-service"; import { search } from "../../search"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; @@ -25,7 +25,6 @@ export async function searchAndScrapeSearchResult( query: string, options: { teamId: string; - plan: PlanType | undefined; origin: string; timeout: number; scrapeOptions: ScrapeOptions; @@ -60,7 +59,6 @@ async function scrapeSearchResult( searchResult: { url: string; title: string; description: string }, options: { teamId: string; - plan: PlanType | undefined; origin: string; timeout: number; scrapeOptions: ScrapeOptions; @@ -68,7 +66,6 @@ async function scrapeSearchResult( ): Promise { const jobId = uuidv4(); const jobPriority = await getJobPriority({ - plan: options.plan as PlanType, team_id: options.teamId, basePriority: 10, }); @@ -84,7 +81,6 @@ async function scrapeSearchResult( team_id: options.teamId, scrapeOptions: options.scrapeOptions, internalOptions: { teamId: options.teamId }, - plan: options.plan || "free", origin: options.origin, is_scrape: true, }, @@ -190,7 +186,6 @@ export async function searchController( const scrapePromises = searchResults.map((result) => scrapeSearchResult(result, { teamId: req.auth.team_id, - plan: req.auth.plan, origin: req.body.origin, timeout: req.body.timeout, scrapeOptions: req.body.scrapeOptions, diff --git a/apps/api/src/controllers/v1/token-usage.ts b/apps/api/src/controllers/v1/token-usage.ts index a49225d1..74c36289 100644 --- a/apps/api/src/controllers/v1/token-usage.ts +++ b/apps/api/src/controllers/v1/token-usage.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { RequestWithAuth } from "./types"; -import { getACUC } from "../auth"; +import { getACUC, getACUCTeam } from "../auth"; import { logger } from "../../lib/logger"; import { RateLimiterMode } from "../../types"; @@ -21,7 +21,7 @@ export async function tokenUsageController( } // Otherwise fetch fresh data - const chunk = await getACUC(req.auth.team_id, false, true, RateLimiterMode.Extract); + const chunk = await getACUCTeam(req.auth.team_id, false, true, RateLimiterMode.Extract); if (!chunk) { res.status(404).json({ success: false, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index a85ef2cd..fa059e6f 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -2,7 +2,6 @@ import { Request, Response } from "express"; import { z } from "zod"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; -import { PlanType } from "../../types"; import { countries } from "../../lib/validate-country"; import { ExtractorOptions, @@ -729,7 +728,6 @@ export type CrawlErrorsResponse = type AuthObject = { team_id: string; - plan: PlanType | undefined; }; type Account = { @@ -742,18 +740,36 @@ export type AuthCreditUsageChunk = { sub_id: string | null; sub_current_period_start: string | null; sub_current_period_end: string | null; + sub_user_id: string | null; price_id: string | null; price_credits: number; // credit limit with assoicated price, or free_credits (500) if free plan credits_used: number; coupon_credits: number; // do not rely on this number to be up to date after calling a billTeam - coupons: any[]; adjusted_credits_used: number; // credits this period minus coupons used remaining_credits: number; - sub_user_id: string | null; total_credits_sum: number; + plan_priority: { + bucketLimit: number; + planModifier: number; + }; + rate_limits: { + crawl: number; + scrape: number; + search: number; + map: number; + extract: number; + preview: number; + crawlStatus: number; + extractStatus: number; + }; + concurrency: number; + + // appended on JS-side is_extract?: boolean; }; +export type AuthCreditUsageChunkFromTeam = Omit; + export interface RequestWithMaybeACUC< ReqParams = {}, ReqBody = undefined, diff --git a/apps/api/src/lib/__tests__/job-priority.test.ts b/apps/api/src/lib/__tests__/job-priority.test.ts index 1a7550ef..118f3fe2 100644 --- a/apps/api/src/lib/__tests__/job-priority.test.ts +++ b/apps/api/src/lib/__tests__/job-priority.test.ts @@ -4,7 +4,7 @@ import { deleteJobPriority, } from "../job-priority"; import { redisConnection } from "../../services/queue-service"; -import { PlanType } from "../../types"; +import { } from "../../types"; jest.mock("../../services/queue-service", () => ({ redisConnection: { @@ -46,14 +46,14 @@ describe("Job Priority Tests", () => { test("getJobPriority should return correct priority based on plan and set length", async () => { const team_id = "team1"; - const plan: PlanType = "standard"; + const plan = "standard"; (redisConnection.scard as jest.Mock).mockResolvedValue(150); - const priority = await getJobPriority({ plan, team_id }); + const priority = await getJobPriority({ team_id }); expect(priority).toBe(10); (redisConnection.scard as jest.Mock).mockResolvedValue(250); - const priorityExceeded = await getJobPriority({ plan, team_id }); + const priorityExceeded = await getJobPriority({ team_id }); expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4) }); @@ -61,23 +61,23 @@ describe("Job Priority Tests", () => { const team_id = "team1"; (redisConnection.scard as jest.Mock).mockResolvedValue(50); - let plan: PlanType = "hobby"; - let priority = await getJobPriority({ plan, team_id }); + let plan = "hobby"; + let priority = await getJobPriority({ team_id }); expect(priority).toBe(10); (redisConnection.scard as jest.Mock).mockResolvedValue(150); plan = "hobby"; - priority = await getJobPriority({ plan, team_id }); + priority = await getJobPriority({ team_id }); expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3) (redisConnection.scard as jest.Mock).mockResolvedValue(25); plan = "free"; - priority = await getJobPriority({ plan, team_id }); + priority = await getJobPriority({ team_id }); expect(priority).toBe(10); (redisConnection.scard as jest.Mock).mockResolvedValue(60); plan = "free"; - priority = await getJobPriority({ plan, team_id }); + priority = await getJobPriority({ team_id }); expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5) }); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index e261800f..a9a57239 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -11,7 +11,6 @@ export type StoredCrawl = { scrapeOptions: Omit; internalOptions: InternalOptions; team_id: string; - plan?: string; robots?: string; cancelled?: boolean; createdAt: number; @@ -24,7 +23,6 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) { method: "saveCrawl", crawlId: id, teamId: crawl.team_id, - plan: crawl.plan, }); await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); await redisConnection.expire("crawl:" + id, 24 * 60 * 60); @@ -274,7 +272,6 @@ export async function lockURL( method: "lockURL", preNormalizedURL: url, teamId: sc.team_id, - plan: sc.plan, }); if (typeof sc.crawlerOptions?.limit === "number") { @@ -335,7 +332,6 @@ export async function lockURLs( module: "crawl-redis", method: "lockURL", teamId: sc.team_id, - plan: sc.plan, }); // Add to visited_unique set diff --git a/apps/api/src/lib/deep-research/deep-research-redis.ts b/apps/api/src/lib/deep-research/deep-research-redis.ts index acefaacc..3e846b49 100644 --- a/apps/api/src/lib/deep-research/deep-research-redis.ts +++ b/apps/api/src/lib/deep-research/deep-research-redis.ts @@ -32,7 +32,6 @@ export type DeepResearchFinding = { export type StoredDeepResearch = { id: string; team_id: string; - plan?: string; createdAt: number; status: "processing" | "completed" | "failed" | "cancelled"; error?: any; diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 590014a8..61b59a17 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -1,6 +1,5 @@ import { logger as _logger } from "../logger"; import { updateDeepResearch } from "./deep-research-redis"; -import { PlanType } from "../../types"; import { searchAndScrapeSearchResult } from "../../controllers/v1/search"; import { ResearchLLMService, ResearchStateManager } from "./research-manager"; import { logJob } from "../../services/logging/log_job"; @@ -10,7 +9,6 @@ import { ExtractOptions } from "../../controllers/v1/types"; interface DeepResearchServiceOptions { researchId: string; teamId: string; - plan: string; query: string; maxDepth: number; maxUrls: number; @@ -23,7 +21,7 @@ interface DeepResearchServiceOptions { } export async function performDeepResearch(options: DeepResearchServiceOptions) { - const { researchId, teamId, plan, timeLimit, subId, maxUrls } = options; + const { researchId, teamId, timeLimit, subId, maxUrls } = options; const startTime = Date.now(); let currentTopic = options.query; let urlsAnalyzed = 0; @@ -39,7 +37,6 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { const state = new ResearchStateManager( researchId, teamId, - plan, options.maxDepth, logger, options.query, @@ -98,7 +95,6 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { const response = await searchAndScrapeSearchResult(searchQuery.query, { teamId: options.teamId, - plan: options.plan as PlanType, origin: "deep-research", timeout: 10000, scrapeOptions: { diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index 70e8067b..0e9afa91 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -29,7 +29,6 @@ export class ResearchStateManager { constructor( private readonly researchId: string, private readonly teamId: string, - private readonly plan: string, private readonly maxDepth: number, private readonly logger: Logger, private readonly topic: string, diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index e9bd729a..313f2f31 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -1,5 +1,4 @@ import { Document, ScrapeOptions, URLTrace, scrapeOptions } from "../../controllers/v1/types"; -import { PlanType } from "../../types"; import { logger } from "../logger"; import { getScrapeQueue } from "../../services/queue-service"; import { waitForJob } from "../../services/queue-jobs"; @@ -10,7 +9,6 @@ import type { Logger } from "winston"; interface ScrapeDocumentOptions { url: string; teamId: string; - plan: PlanType; origin: string; timeout: number; isSingleUrl?: boolean; @@ -31,7 +29,6 @@ export async function scrapeDocument( async function attemptScrape(timeout: number) { const jobId = crypto.randomUUID(); const jobPriority = await getJobPriority({ - plan: options.plan, team_id: options.teamId, basePriority: 10, }); @@ -46,7 +43,6 @@ export async function scrapeDocument( useCache: true, teamId: options.teamId, }, - plan: options.plan, origin: options.origin, is_scrape: true, }, diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index c642f8b3..5c2ecdb4 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -24,7 +24,6 @@ export type ExtractedStep = { export type StoredExtract = { id: string; team_id: string; - plan?: string; createdAt: number; status: "processing" | "completed" | "failed" | "cancelled"; error?: any; diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index f259ffd6..f3d0c87b 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -4,7 +4,6 @@ import { TokenUsage, URLTrace, } from "../../controllers/v1/types"; -import { PlanType } from "../../types"; import { logger as _logger } from "../logger"; import { generateBasicCompletion, processUrl } from "./url-processor"; import { scrapeDocument } from "./document-scraper"; @@ -44,7 +43,6 @@ import { buildRephraseToSerpPrompt } from "./build-prompts"; interface ExtractServiceOptions { request: ExtractRequest; teamId: string; - plan: PlanType; subId?: string; cacheMode?: "load" | "save" | "direct"; cacheKey?: string; @@ -76,7 +74,7 @@ export async function performExtraction( extractId: string, options: ExtractServiceOptions, ): Promise { - const { request, teamId, plan, subId } = options; + const { request, teamId, subId } = options; const urlTraces: URLTrace[] = []; let docsMap: Map = new Map(); let singleAnswerCompletions: completions | null = null; @@ -161,7 +159,6 @@ export async function performExtraction( url, prompt: request.prompt, teamId, - plan, allowExternalLinks: request.allowExternalLinks, origin: request.origin, limit: request.limit, @@ -311,7 +308,6 @@ export async function performExtraction( { url, teamId, - plan, origin: request.origin || "api", timeout, }, @@ -574,7 +570,6 @@ export async function performExtraction( { url, teamId, - plan, origin: request.origin || "api", timeout, }, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 3cdad023..9cc5607e 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -1,6 +1,5 @@ import { MapDocument, URLTrace } from "../../controllers/v1/types"; import { getMapResults } from "../../controllers/v1/map"; -import { PlanType } from "../../types"; import { removeDuplicateUrls } from "../validateUrl"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { buildPreRerankPrompt, buildRefrasedPrompt } from "./build-prompts"; @@ -23,7 +22,6 @@ interface ProcessUrlOptions { prompt?: string; schema?: any; teamId: string; - plan: PlanType; allowExternalLinks?: boolean; origin?: string; limit?: number; @@ -80,7 +78,6 @@ export async function processUrl( url: baseUrl, search: searchQuery, teamId: options.teamId, - plan: options.plan, allowExternalLinks: options.allowExternalLinks, origin: options.origin, limit: options.limit, @@ -117,7 +114,6 @@ export async function processUrl( const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, - plan: options.plan, allowExternalLinks: options.allowExternalLinks, origin: options.origin, limit: options.limit, diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts index 26aebfcf..b32d034d 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts @@ -4,7 +4,6 @@ import { logger as _logger } from "../logger"; export interface GenerationData { id: string; team_id: string; - plan: string; createdAt: number; status: "processing" | "completed" | "failed"; url: string; diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts index c9bd9c0c..fa72d582 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts @@ -3,7 +3,6 @@ import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis"; import { getMapResults } from "../../controllers/v1/map"; import { z } from "zod"; import { scrapeDocument } from "../extract/document-scraper"; -import { PlanType } from "../../types"; import { getLlmsTextFromCache, saveLlmsTextToCache, @@ -16,7 +15,6 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExt interface GenerateLLMsTextServiceOptions { generationId: string; teamId: string; - plan: PlanType; url: string; maxUrls: number; showFullText: boolean; @@ -64,7 +62,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string { export async function performGenerateLlmsTxt( options: GenerateLLMsTextServiceOptions, ) { - const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } = + const { generationId, teamId, url, maxUrls = 100, showFullText, subId } = options; const startTime = Date.now(); const logger = _logger.child({ @@ -113,7 +111,6 @@ export async function performGenerateLlmsTxt( const mapResult = await getMapResults({ url, teamId, - plan, limit: effectiveMaxUrls, includeSubdomains: false, ignoreSitemap: false, @@ -142,7 +139,6 @@ export async function performGenerateLlmsTxt( { url, teamId, - plan, origin: url, timeout: 30000, isSingleUrl: true, diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts index e5b98302..5e89ad9d 100644 --- a/apps/api/src/lib/job-priority.ts +++ b/apps/api/src/lib/job-priority.ts @@ -1,5 +1,5 @@ -import { redisConnection } from "../../src/services/queue-service"; -import { PlanType } from "../../src/types"; +import { getACUC, getACUCTeam } from "../controllers/auth"; +import { redisConnection } from "../services/queue-service"; import { logger } from "./logger"; const SET_KEY_PREFIX = "limit_team_id:"; @@ -29,11 +29,9 @@ export async function deleteJobPriority(team_id, job_id) { } export async function getJobPriority({ - plan, team_id, basePriority = 10, }: { - plan: PlanType | undefined; team_id: string; basePriority?: number; }): Promise { @@ -42,52 +40,16 @@ export async function getJobPriority({ } try { + const acuc = await getACUCTeam(team_id); + const setKey = SET_KEY_PREFIX + team_id; // Get the length of the set const setLength = await redisConnection.scard(setKey); // Determine the priority based on the plan and set length - let planModifier = 1; - let bucketLimit = 0; - - switch (plan) { - case "testSuite": - bucketLimit = 1000; - planModifier = 0.25; - break; - case "free": - bucketLimit = 25; - planModifier = 0.5; - break; - case "hobby": - bucketLimit = 100; - planModifier = 0.3; - break; - case "standard": - case "standardnew": - bucketLimit = 200; - planModifier = 0.2; - break; - case "growth": - case "growthdouble": - bucketLimit = 400; - planModifier = 0.1; - break; - case "etier2c": - bucketLimit = 1000; - planModifier = 0.05; - break; - case "etier1a": - bucketLimit = 1000; - planModifier = 0.05; - break; - - default: - bucketLimit = 25; - planModifier = 1; - break; - } + let planModifier = acuc?.plan_priority.planModifier ?? 1; + let bucketLimit = acuc?.plan_priority.bucketLimit ?? 25; // if length set is smaller than set, just return base priority if (setLength <= bucketLimit) { @@ -100,7 +62,7 @@ export async function getJobPriority({ } } catch (e) { logger.error( - `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`, + `Get job priority failed: ${team_id}, ${basePriority}`, ); return basePriority; } diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index f6a46381..b962ff16 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -105,9 +105,9 @@ export function authMiddleware( } } - const { team_id, plan, chunk } = auth; + const { team_id, chunk } = auth; - req.auth = { team_id, plan }; + req.auth = { team_id }; req.acuc = chunk ?? undefined; if (chunk) { req.account = { remainingCredits: chunk.remaining_credits }; diff --git a/apps/api/src/services/billing/batch_billing.ts b/apps/api/src/services/billing/batch_billing.ts index b72e514c..d5e6591d 100644 --- a/apps/api/src/services/billing/batch_billing.ts +++ b/apps/api/src/services/billing/batch_billing.ts @@ -4,7 +4,7 @@ import { supabase_service } from "../supabase"; import * as Sentry from "@sentry/node"; import { Queue } from "bullmq"; import { withAuth } from "../../lib/withAuth"; -import { getACUC, setCachedACUC } from "../../controllers/auth"; +import { getACUC, setCachedACUC, setCachedACUCTeam } from "../../controllers/auth"; // Configuration constants const BATCH_KEY = "billing_batch"; @@ -298,7 +298,17 @@ async function supaBillTeam( // Update cached ACUC to reflect the new credit usage (async () => { for (const apiKey of (data ?? []).map((x) => x.api_key)) { - await setCachedACUC(apiKey, (acuc) => + await setCachedACUC(apiKey, is_extract, (acuc) => + acuc + ? { + ...acuc, + credits_used: acuc.credits_used + credits, + adjusted_credits_used: acuc.adjusted_credits_used + credits, + remaining_credits: acuc.remaining_credits - credits, + } + : null, + ); + await setCachedACUCTeam(team_id, is_extract, (acuc) => acuc ? { ...acuc, diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 0a1a49fb..3e8985fe 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -1,6 +1,6 @@ import { getScrapeQueue } from "./queue-service"; import { v4 as uuidv4 } from "uuid"; -import { NotificationType, PlanType, WebScraperOptions } from "../types"; +import { NotificationType, WebScraperOptions } from "../types"; import * as Sentry from "@sentry/node"; import { cleanOldConcurrencyLimitEntries, @@ -10,9 +10,9 @@ import { pushConcurrencyLimitedJob, } from "../lib/concurrency-limit"; import { logger } from "../lib/logger"; -import { getConcurrencyLimitMax } from "./rate-limiter"; import { sendNotificationWithCustomDays } from './notification/email_notification'; import { shouldSendConcurrencyLimitNotification } from './notification/notification-check'; +import { getACUC, getACUCTeam } from "../controllers/auth"; /** * Checks if a job is a crawl or batch scrape based on its options @@ -51,8 +51,7 @@ export async function _addScrapeJobToBullMQ( ) { if ( webScraperOptions && - webScraperOptions.team_id && - webScraperOptions.plan + webScraperOptions.team_id ) { await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId, 60 * 1000); // 60s default timeout } @@ -79,7 +78,7 @@ async function addScrapeJobRaw( webScraperOptions.team_id ) { const now = Date.now(); - maxConcurrency = getConcurrencyLimitMax(webScraperOptions.plan ?? "free", webScraperOptions.team_id); + maxConcurrency = (await getACUCTeam(webScraperOptions.team_id))?.concurrency ?? 2; cleanOldConcurrencyLimitEntries(webScraperOptions.team_id, now); currentActiveConcurrency = (await getConcurrencyLimitActiveJobs(webScraperOptions.team_id, now)).length; concurrencyLimited = currentActiveConcurrency >= maxConcurrency; @@ -170,9 +169,9 @@ export async function addScrapeJobs( let currentActiveConcurrency = 0; let maxConcurrency = 0; - if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) { + if (jobs[0].data && jobs[0].data.team_id) { const now = Date.now(); - maxConcurrency = getConcurrencyLimitMax(jobs[0].data.plan as PlanType, jobs[0].data.team_id); + maxConcurrency = (await getACUCTeam(jobs[0].data.team_id))?.concurrency ?? 2; cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); currentActiveConcurrency = (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 36f8ed2d..a1bf4df9 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -48,11 +48,9 @@ import { deleteJobPriority, getJobPriority, } from "../../src/lib/job-priority"; -import { PlanType, RateLimiterMode } from "../types"; import { getJobs } from "..//controllers/v1/crawl-status"; import { configDotenv } from "dotenv"; import { scrapeOptions } from "../controllers/v1/types"; -import { getRateLimiterPoints } from "./rate-limiter"; import { cleanOldConcurrencyLimitEntries, pushConcurrencyLimitActiveJob, @@ -144,7 +142,6 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { url, mode: "single_urls" as const, team_id: job.data.team_id, - plan: job.data.plan!, crawlerOptions: { ...job.data.crawlerOptions, urlInvisibleInCurrentCrawl: true, @@ -407,7 +404,6 @@ const processExtractJobInternal = async ( const result = await performExtraction(job.data.extractId, { request: job.data.request, teamId: job.data.teamId, - plan: job.data.plan, subId: job.data.subId, }); @@ -489,7 +485,6 @@ const processDeepResearchJobInternal = async ( const result = await performDeepResearch({ researchId: job.data.researchId, teamId: job.data.teamId, - plan: job.data.plan, query: job.data.request.query, maxDepth: job.data.request.maxDepth, timeLimit: job.data.request.timeLimit, @@ -564,7 +559,6 @@ const processGenerateLlmsTxtJobInternal = async ( const result = await performGenerateLlmsTxt({ generationId: job.data.generationId, teamId: job.data.teamId, - plan: job.data.plan, url: job.data.request.url, maxUrls: job.data.request.maxUrls, showFullText: job.data.request.showFullText, @@ -682,7 +676,7 @@ const workerFun = async ( runningJobs.delete(job.id); } - if (job.id && job.data && job.data.team_id && job.data.plan) { + if (job.id && job.data && job.data.team_id) { await removeConcurrencyLimitActiveJob(job.data.team_id, job.id); cleanOldConcurrencyLimitEntries(job.data.team_id); @@ -805,7 +799,6 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { crawlerOptions: job.data.crawlerOptions, scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions), internalOptions: sc.internalOptions, - plan: job.data.plan!, origin: job.data.origin, crawl_id: job.data.crawl_id, webhook: job.data.webhook, @@ -844,7 +837,6 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { }); let jobPriority = await getJobPriority({ - plan: job.data.plan, team_id: job.data.team_id, basePriority: 21, }); @@ -858,7 +850,6 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { url, mode: "single_urls" as const, team_id: job.data.team_id, - plan: job.data.plan!, crawlerOptions: job.data.crawlerOptions, scrapeOptions: job.data.scrapeOptions, internalOptions: sc.internalOptions, @@ -1155,7 +1146,6 @@ async function processJob(job: Job & { id: string }, token: string) { if (await lockURL(job.data.crawl_id, sc, link)) { // This seems to work really welel const jobPriority = await getJobPriority({ - plan: sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10, }); @@ -1169,7 +1159,6 @@ async function processJob(job: Job & { id: string }, token: string) { { jobPriority, url: link }, ); - // console.log("plan: ", sc.plan); // console.log("team_id: ", sc.team_id) // console.log("base priority: ", job.data.crawl_id ? 20 : 10) // console.log("job priority: " , jobPriority, "\n\n\n") @@ -1185,7 +1174,6 @@ async function processJob(job: Job & { id: string }, token: string) { ...sc.crawlerOptions, currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1, }, - plan: job.data.plan, origin: job.data.origin, crawl_id: job.data.crawl_id, webhook: job.data.webhook, diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index 098a657c..c2989a05 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -1,370 +1,370 @@ -import { - getRateLimiter, - serverRateLimiter, - testSuiteRateLimiter, - redisRateLimitClient, -} from "./rate-limiter"; -import { RateLimiterMode } from "../../src/types"; -import { RateLimiterRedis } from "rate-limiter-flexible"; +// import { +// getRateLimiter, +// serverRateLimiter, +// redisRateLimitClient, +// } from "./rate-limiter"; +// import { RateLimiterMode } from "../../src/types"; +// import { RateLimiterRedis } from "rate-limiter-flexible"; -describe("Rate Limiter Service", () => { - beforeAll(async () => { - try { - await redisRateLimitClient.connect(); - // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { - // console.log("Erasing all keys"); - // // erase all the keys that start with "test-prefix" - // const keys = await redisRateLimitClient.keys("test-prefix:*"); - // if (keys.length > 0) { - // await redisRateLimitClient.del(...keys); - // } - // } - } catch (error) {} - }); +// describe("Rate Limiter Service", () => { +// beforeAll(async () => { +// try { +// await redisRateLimitClient.connect(); +// // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { +// // console.log("Erasing all keys"); +// // // erase all the keys that start with "test-prefix" +// // const keys = await redisRateLimitClient.keys("test-prefix:*"); +// // if (keys.length > 0) { +// // await redisRateLimitClient.del(...keys); +// // } +// // } +// } catch (error) {} +// }); - afterAll(async () => { - try { - // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { - await redisRateLimitClient.disconnect(); - // } - } catch (error) {} - }); +// afterAll(async () => { +// try { +// // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { +// await redisRateLimitClient.disconnect(); +// // } +// } catch (error) {} +// }); - it("should return the testSuiteRateLimiter for specific tokens", () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:a01ccae", - ); - expect(limiter).toBe(testSuiteRateLimiter); +// it("should return the testSuiteRateLimiter for specific tokens", () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:a01ccae", +// ); +// expect(limiter).toBe(testSuiteRateLimiter); - const limiter2 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:6254cf9", - ); - expect(limiter2).toBe(testSuiteRateLimiter); - }); +// const limiter2 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:6254cf9", +// ); +// expect(limiter2).toBe(testSuiteRateLimiter); +// }); - it("should return the serverRateLimiter if mode is not found", () => { - const limiter = getRateLimiter( - "nonexistent" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter.points).toBe(serverRateLimiter.points); - }); +// it("should return the serverRateLimiter if mode is not found", () => { +// const limiter = getRateLimiter( +// "nonexistent" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter.points).toBe(serverRateLimiter.points); +// }); - it("should return the correct rate limiter based on mode and plan", () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(2); +// it("should return the correct rate limiter based on mode and plan", () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(2); - const limiter2 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - "standard", - ); - expect(limiter2.points).toBe(100); +// const limiter2 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// "standard", +// ); +// expect(limiter2.points).toBe(100); - const limiter3 = getRateLimiter( - "search" as RateLimiterMode, - "test-prefix:someToken", - "growth", - ); - expect(limiter3.points).toBe(500); +// const limiter3 = getRateLimiter( +// "search" as RateLimiterMode, +// "test-prefix:someToken", +// "growth", +// ); +// expect(limiter3.points).toBe(500); - const limiter4 = getRateLimiter( - "crawlStatus" as RateLimiterMode, - "test-prefix:someToken", - "growth", - ); - expect(limiter4.points).toBe(250); - }); +// const limiter4 = getRateLimiter( +// "crawlStatus" as RateLimiterMode, +// "test-prefix:someToken", +// "growth", +// ); +// expect(limiter4.points).toBe(250); +// }); - it("should return the default rate limiter if plan is not provided", () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter.points).toBe(3); +// it("should return the default rate limiter if plan is not provided", () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter.points).toBe(3); - const limiter2 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(20); - }); +// const limiter2 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(20); +// }); - it("should create a new RateLimiterRedis instance with correct parameters", () => { - const keyPrefix = "test-prefix"; - const points = 10; - const limiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix, - points, - duration: 60, - }); +// it("should create a new RateLimiterRedis instance with correct parameters", () => { +// const keyPrefix = "test-prefix"; +// const points = 10; +// const limiter = new RateLimiterRedis({ +// storeClient: redisRateLimitClient, +// keyPrefix, +// points, +// duration: 60, +// }); - expect(limiter.keyPrefix).toBe(keyPrefix); - expect(limiter.points).toBe(points); - expect(limiter.duration).toBe(60); - }); +// expect(limiter.keyPrefix).toBe(keyPrefix); +// expect(limiter.points).toBe(points); +// expect(limiter.duration).toBe(60); +// }); - it("should return the correct rate limiter for 'preview' mode", () => { - const limiter = getRateLimiter( - "preview" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(5); +// it("should return the correct rate limiter for 'preview' mode", () => { +// const limiter = getRateLimiter( +// "preview" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(5); - const limiter2 = getRateLimiter( - "preview" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(5); - }); +// const limiter2 = getRateLimiter( +// "preview" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(5); +// }); - it("should return the correct rate limiter for 'account' mode", () => { - const limiter = getRateLimiter( - "account" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(100); +// it("should return the correct rate limiter for 'account' mode", () => { +// const limiter = getRateLimiter( +// "account" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(100); - const limiter2 = getRateLimiter( - "account" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(100); - }); +// const limiter2 = getRateLimiter( +// "account" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(100); +// }); - it("should return the correct rate limiter for 'crawlStatus' mode", () => { - const limiter = getRateLimiter( - "crawlStatus" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(150); +// it("should return the correct rate limiter for 'crawlStatus' mode", () => { +// const limiter = getRateLimiter( +// "crawlStatus" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(150); - const limiter2 = getRateLimiter( - "crawlStatus" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(250); - }); +// const limiter2 = getRateLimiter( +// "crawlStatus" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(250); +// }); - it("should consume points correctly for 'crawl' mode", async () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someTokenCRAWL", - "free", - ); - const consumePoints = 1; +// it("should consume points correctly for 'crawl' mode", async () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someTokenCRAWL", +// "free", +// ); +// const consumePoints = 1; - const res = await limiter.consume( - "test-prefix:someTokenCRAWL", - consumePoints, - ); - expect(res.remainingPoints).toBe(1); - }); +// const res = await limiter.consume( +// "test-prefix:someTokenCRAWL", +// consumePoints, +// ); +// expect(res.remainingPoints).toBe(1); +// }); - it("should consume points correctly for 'scrape' mode (DEFAULT)", async () => { - const limiter = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someTokenX", - ); - const consumePoints = 4; +// it("should consume points correctly for 'scrape' mode (DEFAULT)", async () => { +// const limiter = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someTokenX", +// ); +// const consumePoints = 4; - const res = await limiter.consume("test-prefix:someTokenX", consumePoints); - expect(res.remainingPoints).toBe(16); - }); +// const res = await limiter.consume("test-prefix:someTokenX", consumePoints); +// expect(res.remainingPoints).toBe(16); +// }); - it("should consume points correctly for 'scrape' mode (HOBBY)", async () => { - const limiter = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someTokenXY", - "hobby", - ); - expect(limiter.points).toBe(20); +// it("should consume points correctly for 'scrape' mode (HOBBY)", async () => { +// const limiter = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someTokenXY", +// "hobby", +// ); +// expect(limiter.points).toBe(20); - const consumePoints = 5; +// const consumePoints = 5; - const res = await limiter.consume("test-prefix:someTokenXY", consumePoints); - expect(res.consumedPoints).toBe(5); - expect(res.remainingPoints).toBe(15); - }); +// const res = await limiter.consume("test-prefix:someTokenXY", consumePoints); +// expect(res.consumedPoints).toBe(5); +// expect(res.remainingPoints).toBe(15); +// }); - it("should return the correct rate limiter for 'crawl' mode", () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(2); +// it("should return the correct rate limiter for 'crawl' mode", () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(2); - const limiter2 = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - "starter", - ); - expect(limiter2.points).toBe(10); +// const limiter2 = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// "starter", +// ); +// expect(limiter2.points).toBe(10); - const limiter3 = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - "standard", - ); - expect(limiter3.points).toBe(5); - }); +// const limiter3 = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// "standard", +// ); +// expect(limiter3.points).toBe(5); +// }); - it("should return the correct rate limiter for 'scrape' mode", () => { - const limiter = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(10); +// it("should return the correct rate limiter for 'scrape' mode", () => { +// const limiter = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(10); - const limiter2 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - "starter", - ); - expect(limiter2.points).toBe(100); +// const limiter2 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// "starter", +// ); +// expect(limiter2.points).toBe(100); - const limiter3 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - "standard", - ); - expect(limiter3.points).toBe(100); +// const limiter3 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// "standard", +// ); +// expect(limiter3.points).toBe(100); - const limiter4 = getRateLimiter( - "scrape" as RateLimiterMode, - "test-prefix:someToken", - "growth", - ); - expect(limiter4.points).toBe(1000); - }); +// const limiter4 = getRateLimiter( +// "scrape" as RateLimiterMode, +// "test-prefix:someToken", +// "growth", +// ); +// expect(limiter4.points).toBe(1000); +// }); - it("should return the correct rate limiter for 'search' mode", () => { - const limiter = getRateLimiter( - "search" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(5); +// it("should return the correct rate limiter for 'search' mode", () => { +// const limiter = getRateLimiter( +// "search" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(5); - const limiter2 = getRateLimiter( - "search" as RateLimiterMode, - "test-prefix:someToken", - "starter", - ); - expect(limiter2.points).toBe(50); +// const limiter2 = getRateLimiter( +// "search" as RateLimiterMode, +// "test-prefix:someToken", +// "starter", +// ); +// expect(limiter2.points).toBe(50); - const limiter3 = getRateLimiter( - "search" as RateLimiterMode, - "test-prefix:someToken", - "standard", - ); - expect(limiter3.points).toBe(50); - }); +// const limiter3 = getRateLimiter( +// "search" as RateLimiterMode, +// "test-prefix:someToken", +// "standard", +// ); +// expect(limiter3.points).toBe(50); +// }); - it("should return the correct rate limiter for 'preview' mode", () => { - const limiter = getRateLimiter( - "preview" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(5); +// it("should return the correct rate limiter for 'preview' mode", () => { +// const limiter = getRateLimiter( +// "preview" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(5); - const limiter2 = getRateLimiter( - "preview" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(5); - }); +// const limiter2 = getRateLimiter( +// "preview" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(5); +// }); - it("should return the correct rate limiter for 'account' mode", () => { - const limiter = getRateLimiter( - "account" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(100); +// it("should return the correct rate limiter for 'account' mode", () => { +// const limiter = getRateLimiter( +// "account" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(100); - const limiter2 = getRateLimiter( - "account" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(100); - }); +// const limiter2 = getRateLimiter( +// "account" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(100); +// }); - it("should return the correct rate limiter for 'crawlStatus' mode", () => { - const limiter = getRateLimiter( - "crawlStatus" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(150); +// it("should return the correct rate limiter for 'crawlStatus' mode", () => { +// const limiter = getRateLimiter( +// "crawlStatus" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(150); - const limiter2 = getRateLimiter( - "crawlStatus" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(250); - }); +// const limiter2 = getRateLimiter( +// "crawlStatus" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(250); +// }); - it("should return the correct rate limiter for 'testSuite' mode", () => { - const limiter = getRateLimiter( - "testSuite" as RateLimiterMode, - "test-prefix:someToken", - "free", - ); - expect(limiter.points).toBe(10000); +// it("should return the correct rate limiter for 'testSuite' mode", () => { +// const limiter = getRateLimiter( +// "testSuite" as RateLimiterMode, +// "test-prefix:someToken", +// "free", +// ); +// expect(limiter.points).toBe(10000); - const limiter2 = getRateLimiter( - "testSuite" as RateLimiterMode, - "test-prefix:someToken", - ); - expect(limiter2.points).toBe(10000); - }); +// const limiter2 = getRateLimiter( +// "testSuite" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// expect(limiter2.points).toBe(10000); +// }); - it("should throw an error when consuming more points than available", async () => { - const limiter = getRateLimiter( - "crawl" as RateLimiterMode, - "test-prefix:someToken", - ); - const consumePoints = limiter.points + 1; +// it("should throw an error when consuming more points than available", async () => { +// const limiter = getRateLimiter( +// "crawl" as RateLimiterMode, +// "test-prefix:someToken", +// ); +// const consumePoints = limiter.points + 1; - try { - await limiter.consume("test-prefix:someToken", consumePoints); - } catch (error) { - // expect remaining points to be 0 - const res = await limiter.get("test-prefix:someToken"); - expect(res?.remainingPoints).toBe(0); - } - }); +// try { +// await limiter.consume("test-prefix:someToken", consumePoints); +// } catch (error) { +// // expect remaining points to be 0 +// const res = await limiter.get("test-prefix:someToken"); +// expect(res?.remainingPoints).toBe(0); +// } +// }); - it("should reset points after duration", async () => { - const keyPrefix = "test-prefix"; - const points = 10; - const duration = 1; // 1 second - const limiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix, - points, - duration, - }); +// it("should reset points after duration", async () => { +// const keyPrefix = "test-prefix"; +// const points = 10; +// const duration = 1; // 1 second +// const limiter = new RateLimiterRedis({ +// storeClient: redisRateLimitClient, +// keyPrefix, +// points, +// duration, +// }); - const consumePoints = 5; - await limiter.consume("test-prefix:someToken", consumePoints); - await new Promise((resolve) => setTimeout(resolve, duration * 1000 + 100)); // Wait for duration + 100ms +// const consumePoints = 5; +// await limiter.consume("test-prefix:someToken", consumePoints); +// await new Promise((resolve) => setTimeout(resolve, duration * 1000 + 100)); // Wait for duration + 100ms - const res = await limiter.consume("test-prefix:someToken", consumePoints); - expect(res.remainingPoints).toBe(points - consumePoints); - }); -}); +// const res = await limiter.consume("test-prefix:someToken", consumePoints); +// expect(res.remainingPoints).toBe(points - consumePoints); +// }); +// }); +// TODO: FIX \ No newline at end of file diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 522f4ed3..600b42a6 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -1,162 +1,7 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; -import { PlanType, RateLimiterMode } from "../../src/types"; +import { RateLimiterMode } from "../types"; import Redis from "ioredis"; - -export const CONCURRENCY_LIMIT: Omit, ""> = { - free: 2, - hobby: 5, - starter: 50, - standard: 50, - standardNew: 50, - standardnew: 50, - scale: 100, - growth: 100, - growthdouble: 100, - etier2c: 300, - etier1a: 200, - etier2a: 300, - etierscale1: 150, - etierscale2: 200, - testSuite: 200, - devB: 120, - etier2d: 250, - manual: 200, - extract_starter: 20, - extract_explorer: 100, - extract_pro: 200 -}; - -const RATE_LIMITS = { - crawl: { - default: 15, - free: 2, - starter: 50, - standard: 25, - standardOld: 200, - scale: 250, - hobby: 15, - standardNew: 50, - standardnew: 50, - growth: 250, - growthdouble: 250, - etier2c: 1500, - etier1a: 5000, - etier2a: 1500, - etierscale1: 750, - etierscale2: 1500, - // extract ops - extract_starter: 100, - extract_explorer: 500, - extract_pro: 1000, - }, - scrape: { - default: 100, - free: 10, - starter: 500, - standard: 500, - standardOld: 500, - scale: 2500, - hobby: 100, - standardNew: 500, - standardnew: 500, - growth: 5000, - growthdouble: 5000, - etier2c: 12500, - etier1a: 5000, - etier2a: 12500, - etierscale1: 7500, - etierscale2: 12500, - // extract ops - extract_starter: 100, - extract_explorer: 500, - extract_pro: 1000, - }, - search: { - default: 100, - free: 5, - starter: 250, - standard: 250, - standardOld: 200, - scale: 2500, - hobby: 50, - standardNew: 250, - standardnew: 250, - growth: 2500, - growthdouble: 2500, - etier2c: 12500, - etier1a: 5000, - etier2a: 12500, - etierscale1: 7500, - etierscale2: 12500, - // extract ops - extract_starter: 100, - extract_explorer: 500, - extract_pro: 1000, - }, - map: { - default: 100, - free: 5, - starter: 250, - standard: 250, - standardOld: 250, - scale: 2500, - hobby: 50, - standardNew: 250, - standardnew: 250, - growth: 5000, - growthdouble: 5000, - etier2c: 12500, - etier1a: 5000, - etier2a: 12500, - etierscale1: 7500, - etierscale2: 12500, - // extract ops - extract_starter: 100, - extract_explorer: 500, - extract_pro: 1000, - }, - extract: { - default: 100, - free: 10, - starter: 500, - standard: 500, - standardOld: 500, - scale: 1000, - hobby: 100, - standardNew: 500, - standardnew: 500, - growth: 1000, - growthdouble: 1000, - etier2c: 1000, - etier1a: 1000, - etier2a: 1000, - etierscale1: 1000, - etierscale2: 1000, - extract_starter: 100, - extract_explorer: 500, - extract_pro: 1000, - }, - preview: { - free: 5, - default: 25, - }, - account: { - free: 100, - default: 500, - }, - crawlStatus: { - free: 500, - default: 25000, - }, - extractStatus: { - free: 500, - default: 25000, - }, - testSuite: { - free: 10000, - default: 50000, - }, -}; +import type { AuthCreditUsageChunk } from "../controllers/v1/types"; export const redisRateLimitClient = new Redis( process.env.REDIS_RATE_LIMIT_URL!, @@ -170,11 +15,6 @@ const createRateLimiter = (keyPrefix, points) => duration: 60, // Duration in seconds }); -export const serverRateLimiter = createRateLimiter( - "server", - RATE_LIMITS.account.default, -); - export const testSuiteRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "test-suite", @@ -182,41 +22,7 @@ export const testSuiteRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); -export const devBRateLimiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix: "dev-b", - points: 1200, - duration: 60, // Duration in seconds -}); - -export const manualRateLimiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix: "manual", - points: 10000, - duration: 60, // Duration in seconds -}); - -export const scrapeStatusRateLimiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix: "scrape-status", - points: 400, - duration: 60, // Duration in seconds -}); - -export const etier1aRateLimiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix: "etier1a", - points: 10000, - duration: 60, // Duration in seconds -}); - -export const etier2aRateLimiter = new RateLimiterRedis({ - storeClient: redisRateLimitClient, - keyPrefix: "etier2a", - points: 2500, - duration: 60, // Duration in seconds -}); - +// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery const testSuiteTokens = [ "a01ccae", "6254cf9", @@ -240,105 +46,32 @@ const testSuiteTokens = [ "0a18c9e", // gh ]; -const manual_growth = ["22a07b64-cbfe-4924-9273-e3f01709cdf2"]; -const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; -const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e", "778c62c4-306f-4039-b372-eb20174760c0"]; +// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery +// const manual_growth = ["22a07b64-cbfe-4924-9273-e3f01709cdf2"]; +// const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; +// const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e", "778c62c4-306f-4039-b372-eb20174760c0"]; -function makePlanKey(plan?: string) { - return plan ? plan.replace("-", "") : "default"; // "default" -} - -export function getRateLimiterPoints( - mode: RateLimiterMode, - token?: string, - plan?: string, - teamId?: string, -): number { - const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} - - if (!rateLimitConfig) return RATE_LIMITS.account.default; - - const points: number = - rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5 - - return points; -} +const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = { + crawl: 15, + scrape: 100, + search: 100, + map: 100, + extract: 100, + preview: 25, + extractStatus: 25000, + crawlStatus: 25000, +}; export function getRateLimiter( mode: RateLimiterMode, - token?: string, - plan?: string, - teamId?: string, + rate_limits: AuthCreditUsageChunk["rate_limits"] | null, ): RateLimiterRedis { - if (token && testSuiteTokens.some((testToken) => token.includes(testToken))) { - return testSuiteRateLimiter; - } - - if (teamId && teamId === process.env.DEV_B_TEAM_ID) { - return devBRateLimiter; - } - - if (teamId && (teamId === process.env.ETIER1A_TEAM_ID || teamId === process.env.ETIER1A_TEAM_ID_O)) { - return etier1aRateLimiter; - } - - if (teamId && (teamId === process.env.ETIER2A_TEAM_ID || teamId === process.env.ETIER2A_TEAM_ID_B)) { - return etier2aRateLimiter; - } - - if (teamId && teamId === process.env.ETIER2D_TEAM_ID) { - return etier2aRateLimiter; - } - - if (teamId && (manual.includes(teamId) || manual_etier2c.includes(teamId))) { - return manualRateLimiter; - } - return createRateLimiter( - `${mode}-${makePlanKey(plan)}`, - getRateLimiterPoints(mode, token, plan, teamId), + `${mode}`, + (rate_limits ?? fallbackRateLimits)[mode] ?? 500, ); } -export function getConcurrencyLimitMax( - plan: PlanType, - teamId?: string, -): number { - // Moved this to auth check, plan will come as testSuite if token is present - // if (token && testSuiteTokens.some((testToken) => token.includes(testToken))) { - // return CONCURRENCY_LIMIT.testSuite; - // } - if (teamId && teamId === process.env.DEV_B_TEAM_ID) { - return CONCURRENCY_LIMIT.devB; - } - - if (teamId && (teamId === process.env.ETIER1A_TEAM_ID || teamId === process.env.ETIER1A_TEAM_ID_O)) { - return CONCURRENCY_LIMIT.etier1a; - } - - if (teamId && (teamId === process.env.ETIER2A_TEAM_ID || teamId === process.env.ETIER2A_TEAM_ID_B)) { - return CONCURRENCY_LIMIT.etier2a; - } - - if (teamId && teamId === process.env.ETIER2D_TEAM_ID) { - return CONCURRENCY_LIMIT.etier2a; - } - - if (teamId && manual.includes(teamId)) { - return CONCURRENCY_LIMIT.manual; - } - - if (teamId && manual_etier2c.includes(teamId)) { - return CONCURRENCY_LIMIT.etier2c; - } - - if (teamId && manual_growth.includes(teamId)) { - return CONCURRENCY_LIMIT.growth; - } - - return CONCURRENCY_LIMIT[plan] ?? 10; -} - export function isTestSuiteToken(token: string): boolean { return testSuiteTokens.some((testToken) => token.includes(testToken)); } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index adc7df37..a42d4cb1 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -37,7 +37,6 @@ export interface WebScraperOptions { scrapeOptions: ScrapeOptions; internalOptions?: InternalOptions; team_id: string; - plan: string; origin?: string; crawl_id?: string; sitemapped?: boolean; @@ -144,7 +143,6 @@ export type AuthResponse = success: true; team_id: string; api_key?: string; - plan?: PlanType; chunk: AuthCreditUsageChunk | null; } | { @@ -178,30 +176,6 @@ export type ScrapeLog = { ipv6_support?: boolean | null; }; -export type PlanType = - | "starter" - | "standard" - | "scale" - | "hobby" - | "standardnew" - | "standardNew" - | "growth" - | "growthdouble" - | "etier2c" - | "etier1a" - | "etierscale1" - | "etierscale2" - | "etier2a" - | "free" - | "testSuite" - | "devB" - | "etier2d" - | "manual" - | "extract_starter" - | "extract_explorer" - | "extract_pro" - | ""; - export type WebhookEventType = | "crawl.page" | "batch_scrape.page" From f16f034463d4216de4a9ae12fcad18193d66bd6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 18:58:59 +0200 Subject: [PATCH 069/160] temp: get acuc from main db --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 395331af..98192bc8 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -136,7 +136,7 @@ export async function getACUC( const maxRetries = 5; while (retries < maxRetries) { const client = - Math.random() > (2/3) ? supabase_rr_service : supabase_service; + /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, @@ -247,7 +247,7 @@ export async function getACUCTeam( while (retries < maxRetries) { const client = - Math.random() > (2/3) ? supabase_rr_service : supabase_service; + /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, From 8566bff35c56ac042b54afd661a0cba7c4556e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 10 Apr 2025 20:28:43 +0200 Subject: [PATCH 070/160] Revert "temp: get acuc from main db" This reverts commit f16f034463d4216de4a9ae12fcad18193d66bd6d. --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 98192bc8..395331af 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -136,7 +136,7 @@ export async function getACUC( const maxRetries = 5; while (retries < maxRetries) { const client = - /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; + Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, @@ -247,7 +247,7 @@ export async function getACUCTeam( while (retries < maxRetries) { const client = - /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; + Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, From d3b821e8279414b2b998705aa93b644621a0d4cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 11 Apr 2025 00:12:57 +0200 Subject: [PATCH 071/160] Revert "Revert "temp: get acuc from main db"" This reverts commit 8566bff35c56ac042b54afd661a0cba7c4556e53. --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 395331af..98192bc8 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -136,7 +136,7 @@ export async function getACUC( const maxRetries = 5; while (retries < maxRetries) { const client = - Math.random() > (2/3) ? supabase_rr_service : supabase_service; + /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, @@ -247,7 +247,7 @@ export async function getACUCTeam( while (retries < maxRetries) { const client = - Math.random() > (2/3) ? supabase_rr_service : supabase_service; + /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, From 0bed648b0d711e0e5b7b1ea1a253f2b91c3b0013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 11 Apr 2025 07:13:25 +0200 Subject: [PATCH 072/160] Revert "Revert "Revert "temp: get acuc from main db""" This reverts commit d3b821e8279414b2b998705aa93b644621a0d4cc. --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 98192bc8..395331af 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -136,7 +136,7 @@ export async function getACUC( const maxRetries = 5; while (retries < maxRetries) { const client = - /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; + Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, @@ -247,7 +247,7 @@ export async function getACUCTeam( while (retries < maxRetries) { const client = - /* Math.random() > (2/3) ? supabase_rr_service : */ supabase_service; + Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( "auth_credit_usage_chunk_28_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, From f52d6aab9194aec671a00bb6c8149390ebd7f39d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 10 Apr 2025 22:59:28 -0700 Subject: [PATCH 073/160] (feat/deep-research) Improvements to final analysis (#1443) * Nick: fixes * Update research-manager.ts * Update research-manager.ts --- apps/api/package.json | 4 +- apps/api/pnpm-lock.yaml | 93 +++++++++++-------- .../src/lib/deep-research/research-manager.ts | 12 ++- .../api/src/services/billing/batch_billing.ts | 2 +- 4 files changed, 66 insertions(+), 45 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 35791483..1dc26a05 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -51,7 +51,7 @@ "typescript": "^5.8.3" }, "dependencies": { - "@ai-sdk/openai": "^1.1.13", + "@ai-sdk/openai": "^1.3.10", "@anthropic-ai/sdk": "^0.24.3", "@apidevtools/json-schema-ref-parser": "^11.7.3", "@brillout/import": "^0.2.2", @@ -68,7 +68,7 @@ "@supabase/supabase-js": "^2.44.2", "@types/express-ws": "^3.0.4", "@types/ws": "^8.5.12", - "ai": "^4.1.45", + "ai": "^4.3.4", "ajv": "^8.16.0", "async": "^3.2.5", "async-mutex": "^0.5.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 508fb7cd..8644e7bd 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@ai-sdk/openai': - specifier: ^1.1.13 - version: 1.1.13(zod@3.24.2) + specifier: ^1.3.10 + version: 1.3.10(zod@3.24.2) '@anthropic-ai/sdk': specifier: ^0.24.3 version: 0.24.3(encoding@0.1.13) @@ -60,8 +60,8 @@ importers: specifier: ^8.5.12 version: 8.5.12 ai: - specifier: ^4.1.45 - version: 4.1.45(react@18.3.1)(zod@3.24.2) + specifier: ^4.3.4 + version: 4.3.4(react@18.3.1)(zod@3.24.2) ajv: specifier: ^8.16.0 version: 8.16.0 @@ -324,8 +324,8 @@ importers: packages: - '@ai-sdk/openai@1.1.13': - resolution: {integrity: sha512-IdChK1pJTW3NQis02PG/hHTG0gZSyQIMOLPt7f7ES56C0xH2yaKOU1Tp2aib7pZzWGwDlzTOW2h5TtAB8+V6CQ==} + '@ai-sdk/openai@1.3.10': + resolution: {integrity: sha512-XO0wF2lmAMWCYjkM5bLpWTKoXet61fBiIimTi+blqEGiLUjAvivt/1zZL1Lzhrv9+p19IC1rn9EWZI1dCelV8w==} engines: {node: '>=18'} peerDependencies: zod: ^3.0.0 @@ -339,30 +339,35 @@ packages: zod: optional: true + '@ai-sdk/provider-utils@2.2.6': + resolution: {integrity: sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.23.8 + '@ai-sdk/provider@1.0.8': resolution: {integrity: sha512-f9jSYwKMdXvm44Dmab1vUBnfCDSFfI5rOtvV1W9oKB7WYHR5dGvCC6x68Mk3NUfrdmNoMVHGoh6JT9HCVMlMow==} engines: {node: '>=18'} - '@ai-sdk/react@1.1.17': - resolution: {integrity: sha512-NAuEflFvjw1uh1AOmpyi7rBF4xasWsiWUb86JQ8ScjDGxoGDYEdBnaHOxUpooLna0dGNbSPkvDMnVRhoLKoxPQ==} + '@ai-sdk/provider@1.1.2': + resolution: {integrity: sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==} + engines: {node: '>=18'} + + '@ai-sdk/react@1.2.8': + resolution: {integrity: sha512-S2FzCSi4uTF0JuSN6zYMXyiAWVAzi/Hho8ISYgHpGZiICYLNCP2si4DuXQOsnWef3IXzQPLVoE11C63lILZIkw==} engines: {node: '>=18'} peerDependencies: react: ^18 || ^19 || ^19.0.0-rc - zod: ^3.0.0 + zod: ^3.23.8 peerDependenciesMeta: - react: - optional: true zod: optional: true - '@ai-sdk/ui-utils@1.1.15': - resolution: {integrity: sha512-NsV/3CMmjc4m53snzRdtZM6teTQUXIKi8u0Kf7GBruSzaMSuZ4DWaAAlUshhR3p2FpZgtsogW+vYG1/rXsGu+Q==} + '@ai-sdk/ui-utils@1.2.7': + resolution: {integrity: sha512-OVRxa4SDj0wVsMZ8tGr/whT89oqNtNoXBKmqWC2BRv5ZG6azL2LYZ5ZK35u3lb4l1IE7cWGsLlmq0py0ttsL7A==} engines: {node: '>=18'} peerDependencies: - zod: ^3.0.0 - peerDependenciesMeta: - zod: - optional: true + zod: ^3.23.8 '@ampproject/remapping@2.3.0': resolution: {integrity: sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==} @@ -1809,17 +1814,15 @@ packages: resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==} engines: {node: '>= 8.0.0'} - ai@4.1.45: - resolution: {integrity: sha512-nQkxQ2zCD+O/h8zJ+PxmBv9coyMaG1uP9kGJvhNaGAA25hbZRQWL0NbTsSJ/QMOUraXKLa+6fBm3VF1NkJK9Kg==} + ai@4.3.4: + resolution: {integrity: sha512-uMjzrowIqfU8CCCxhx8QGl7ETydHBROeNL0VoEwetkmDCY6Q8ZTacj6jNNqGJOiCk595aUrGR9VHPY9Ylvy1fg==} engines: {node: '>=18'} peerDependencies: react: ^18 || ^19 || ^19.0.0-rc - zod: ^3.0.0 + zod: ^3.23.8 peerDependenciesMeta: react: optional: true - zod: - optional: true ajv@8.16.0: resolution: {integrity: sha512-F0twR8U1ZU67JIEtekUcLkXkoO5mMMmgGD8sK/xUFzJ805jxHQl92hImFAqqXMyMYjSPOyUPAwHYhB72g5sTXw==} @@ -4749,10 +4752,10 @@ packages: snapshots: - '@ai-sdk/openai@1.1.13(zod@3.24.2)': + '@ai-sdk/openai@1.3.10(zod@3.24.2)': dependencies: - '@ai-sdk/provider': 1.0.8 - '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2) + '@ai-sdk/provider': 1.1.2 + '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) zod: 3.24.2 '@ai-sdk/provider-utils@2.1.9(zod@3.24.2)': @@ -4764,27 +4767,37 @@ snapshots: optionalDependencies: zod: 3.24.2 + '@ai-sdk/provider-utils@2.2.6(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.2 + nanoid: 3.3.8 + secure-json-parse: 2.7.0 + zod: 3.24.2 + '@ai-sdk/provider@1.0.8': dependencies: json-schema: 0.4.0 - '@ai-sdk/react@1.1.17(react@18.3.1)(zod@3.24.2)': + '@ai-sdk/provider@1.1.2': dependencies: - '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2) - '@ai-sdk/ui-utils': 1.1.15(zod@3.24.2) + json-schema: 0.4.0 + + '@ai-sdk/react@1.2.8(react@18.3.1)(zod@3.24.2)': + dependencies: + '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) + '@ai-sdk/ui-utils': 1.2.7(zod@3.24.2) + react: 18.3.1 swr: 2.3.2(react@18.3.1) throttleit: 2.1.0 optionalDependencies: - react: 18.3.1 zod: 3.24.2 - '@ai-sdk/ui-utils@1.1.15(zod@3.24.2)': + '@ai-sdk/ui-utils@1.2.7(zod@3.24.2)': dependencies: - '@ai-sdk/provider': 1.0.8 - '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2) - zod-to-json-schema: 3.24.2(zod@3.24.2) - optionalDependencies: + '@ai-sdk/provider': 1.1.2 + '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) zod: 3.24.2 + zod-to-json-schema: 3.24.2(zod@3.24.2) '@ampproject/remapping@2.3.0': dependencies: @@ -6978,17 +6991,17 @@ snapshots: dependencies: humanize-ms: 1.2.1 - ai@4.1.45(react@18.3.1)(zod@3.24.2): + ai@4.3.4(react@18.3.1)(zod@3.24.2): dependencies: - '@ai-sdk/provider': 1.0.8 - '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2) - '@ai-sdk/react': 1.1.17(react@18.3.1)(zod@3.24.2) - '@ai-sdk/ui-utils': 1.1.15(zod@3.24.2) + '@ai-sdk/provider': 1.1.2 + '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) + '@ai-sdk/react': 1.2.8(react@18.3.1)(zod@3.24.2) + '@ai-sdk/ui-utils': 1.2.7(zod@3.24.2) '@opentelemetry/api': 1.9.0 jsondiffpatch: 0.6.0 + zod: 3.24.2 optionalDependencies: react: 18.3.1 - zod: 3.24.2 ajv@8.16.0: dependencies: diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index 0e9afa91..900dcd21 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -7,6 +7,8 @@ import { } from "./deep-research-redis"; import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract"; import { ExtractOptions } from "../../controllers/v1/types"; +import { openai } from "@ai-sdk/openai/dist"; +import { getModel } from "../generic-ai"; interface AnalysisResult { gaps: string[]; nextSteps: string[]; @@ -277,7 +279,7 @@ export class ResearchLLMService { }), systemPrompt: formats.includes('json') ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly." - : "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + + : "You are an expert research analyst who creates comprehensive, well-structured reports. Don't begin the report by saying 'Here is the report', nor 'Below is the report', nor something similar. ALWAYS start with a great title that reflects the research topic and findings. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + new Date().toISOString().split("T")[0], prompt: trimToTokenLimit( analysisPrompt @@ -296,11 +298,17 @@ export class ResearchLLMService { - Make it comprehensive and thorough (aim for 4+ pages worth of content) - Include all relevant findings and insights from the research - Cite sources - - Use bullet points and lists where appropriate for readability`, + - Cite sources throughout the report + - Use bullet points and lists where appropriate for readability + - Don't begin the report by saying "Here is the report", nor "Below is the report", nor something similar. + - ALWAYS Start with a great title that reflects the research topic and findings - concise and to the point. That's the first thing you should output. + + Begin!`, 100000, ).text, }, markdown: "", + model: getModel('o3-mini'), }); return extract; diff --git a/apps/api/src/services/billing/batch_billing.ts b/apps/api/src/services/billing/batch_billing.ts index d5e6591d..8006b880 100644 --- a/apps/api/src/services/billing/batch_billing.ts +++ b/apps/api/src/services/billing/batch_billing.ts @@ -122,7 +122,7 @@ export async function processBillingBatch() { group.is_extract ); - logger.info(`✅ Successfully billed team ${group.team_id} for ${group.total_credits} credits`); + logger.info(`✅ Successfully billed team ${group.team_id} for ${group.total_credits} ${group.is_extract ? 'tokens' : 'credits'}`); } catch (error) { logger.error(`❌ Failed to bill team ${group.team_id}`, { error, group }); Sentry.captureException(error, { From 6e9396dc5752d5bc78b06657595ba8537ce55c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 11 Apr 2025 18:57:14 +0200 Subject: [PATCH 074/160] feat(search): add further logging --- apps/api/src/controllers/v1/search.ts | 59 +++++++++++++++++-- .../deep-research/deep-research-service.ts | 2 +- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index d48c511f..bcb18166 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -1,5 +1,4 @@ import { Response } from "express"; -import { logger } from "../../lib/logger"; import { Document, RequestWithAuth, @@ -19,6 +18,8 @@ import { search } from "../../search"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import * as Sentry from "@sentry/node"; import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; +import { logger as _logger } from "../../lib/logger"; +import type { Logger } from "winston"; // Used for deep research export async function searchAndScrapeSearchResult( @@ -28,7 +29,8 @@ export async function searchAndScrapeSearchResult( origin: string; timeout: number; scrapeOptions: ScrapeOptions; - } + }, + logger: Logger, ): Promise { try { const searchResults = await search({ @@ -44,7 +46,8 @@ export async function searchAndScrapeSearchResult( title: result.title, description: result.description }, - options + options, + logger ) ) ); @@ -63,6 +66,7 @@ async function scrapeSearchResult( timeout: number; scrapeOptions: ScrapeOptions; }, + logger: Logger, ): Promise { const jobId = uuidv4(); const jobPriority = await getJobPriority({ @@ -74,6 +78,12 @@ async function scrapeSearchResult( if (isUrlBlocked(searchResult.url)) { throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE); } + logger.info("Adding scrape job", { + scrapeId: jobId, + url: searchResult.url, + teamId: options.teamId, + origin: options.origin, + }); await addScrapeJob( { url: searchResult.url, @@ -90,6 +100,12 @@ async function scrapeSearchResult( ); const doc = await waitForJob(jobId, options.timeout); + logger.info("Scrape job completed", { + scrapeId: jobId, + url: searchResult.url, + teamId: options.teamId, + origin: options.origin, + }); await getScrapeQueue().remove(jobId); // Move SERP results to top level @@ -101,6 +117,7 @@ async function scrapeSearchResult( }; } catch (error) { logger.error(`Error in scrapeSearchResult: ${error}`, { + scrapeId: jobId, url: searchResult.url, teamId: options.teamId, }); @@ -126,10 +143,22 @@ export async function searchController( req: RequestWithAuth<{}, SearchResponse, SearchRequest>, res: Response, ) { + const jobId = uuidv4(); + let logger = _logger.child({ + jobId, + teamId: req.auth.team_id, + module: "search", + method: "searchController", + }); + try { req.body = searchRequestSchema.parse(req.body); - const jobId = uuidv4(); + logger = logger.child({ + query: req.body.query, + origin: req.body.origin, + }); + const startTime = new Date().getTime(); let limit = req.body.limit; @@ -137,6 +166,8 @@ export async function searchController( // Buffer results by 50% to account for filtered URLs const num_results_buffer = Math.floor(limit * 2); + logger.info("Searching for results"); + let searchResults = await search({ query: req.body.query, advanced: false, @@ -148,12 +179,17 @@ export async function searchController( location: req.body.location, }); + logger.info("Searching completed", { + num_results: searchResults.length, + }); + // Filter blocked URLs early to avoid unnecessary billing if (searchResults.length > limit) { searchResults = searchResults.slice(0, limit); } if (searchResults.length === 0) { + logger.info("No search results found"); return res.status(200).json({ success: true, data: [], @@ -183,16 +219,20 @@ export async function searchController( } // Scrape each non-blocked result, handling timeouts individually + logger.info("Scraping search results"); const scrapePromises = searchResults.map((result) => scrapeSearchResult(result, { teamId: req.auth.team_id, origin: req.body.origin, timeout: req.body.timeout, scrapeOptions: req.body.scrapeOptions, - }), + }, logger), ); const docs = await Promise.all(scrapePromises); + logger.info("Scraping completed", { + num_docs: docs.length, + }); // Bill for successful scrapes only billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => { @@ -207,6 +247,10 @@ export async function searchController( doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0), ); + logger.info("Filtering completed", { + num_docs: filteredDocs.length, + }); + if (filteredDocs.length === 0) { return res.status(200).json({ success: true, @@ -218,6 +262,11 @@ export async function searchController( const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; + logger.info("Logging job", { + num_docs: filteredDocs.length, + time_taken: timeTakenInSeconds, + }); + logJob({ job_id: jobId, success: true, diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 61b59a17..64d94946 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -109,7 +109,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { fastMode: false, blockAds: false, }, - }); + }, logger); return response.length > 0 ? response : []; }); From f18a6b20ffbc908da00596c083b3eab1b79aff4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 11 Apr 2025 20:38:51 +0200 Subject: [PATCH 075/160] extract concurrency hotfix --- apps/api/src/lib/extract/document-scraper.ts | 2 ++ apps/api/src/lib/job-priority.ts | 5 ++++- apps/api/src/services/queue-jobs.ts | 6 +++--- apps/api/src/types.ts | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 313f2f31..f5501230 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -31,6 +31,7 @@ export async function scrapeDocument( const jobPriority = await getJobPriority({ team_id: options.teamId, basePriority: 10, + from_extract: true, }); await addScrapeJob( @@ -45,6 +46,7 @@ export async function scrapeDocument( }, origin: options.origin, is_scrape: true, + from_extract: true, }, {}, jobId, diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts index 5e89ad9d..02356c21 100644 --- a/apps/api/src/lib/job-priority.ts +++ b/apps/api/src/lib/job-priority.ts @@ -1,3 +1,4 @@ +import { RateLimiterMode } from "../types"; import { getACUC, getACUCTeam } from "../controllers/auth"; import { redisConnection } from "../services/queue-service"; import { logger } from "./logger"; @@ -31,16 +32,18 @@ export async function deleteJobPriority(team_id, job_id) { export async function getJobPriority({ team_id, basePriority = 10, + from_extract = false, }: { team_id: string; basePriority?: number; + from_extract?: boolean; }): Promise { if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") { return 50; } try { - const acuc = await getACUCTeam(team_id); + const acuc = await getACUCTeam(team_id, false, true, from_extract ? RateLimiterMode.Extract : RateLimiterMode.Crawl); const setKey = SET_KEY_PREFIX + team_id; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 3e8985fe..24924d7d 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -1,6 +1,6 @@ import { getScrapeQueue } from "./queue-service"; import { v4 as uuidv4 } from "uuid"; -import { NotificationType, WebScraperOptions } from "../types"; +import { NotificationType, RateLimiterMode, WebScraperOptions } from "../types"; import * as Sentry from "@sentry/node"; import { cleanOldConcurrencyLimitEntries, @@ -78,7 +78,7 @@ async function addScrapeJobRaw( webScraperOptions.team_id ) { const now = Date.now(); - maxConcurrency = (await getACUCTeam(webScraperOptions.team_id))?.concurrency ?? 2; + maxConcurrency = (await getACUCTeam(webScraperOptions.team_id, false, true, webScraperOptions.is_extract ? RateLimiterMode.Extract : RateLimiterMode.Crawl))?.concurrency ?? 2; cleanOldConcurrencyLimitEntries(webScraperOptions.team_id, now); currentActiveConcurrency = (await getConcurrencyLimitActiveJobs(webScraperOptions.team_id, now)).length; concurrencyLimited = currentActiveConcurrency >= maxConcurrency; @@ -171,7 +171,7 @@ export async function addScrapeJobs( if (jobs[0].data && jobs[0].data.team_id) { const now = Date.now(); - maxConcurrency = (await getACUCTeam(jobs[0].data.team_id))?.concurrency ?? 2; + maxConcurrency = (await getACUCTeam(jobs[0].data.team_id, false, true, jobs[0].data.from_extract ? RateLimiterMode.Extract : RateLimiterMode.Crawl))?.concurrency ?? 2; cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); currentActiveConcurrency = (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index a42d4cb1..6a6ae6d9 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -44,6 +44,7 @@ export interface WebScraperOptions { v1?: boolean; is_scrape?: boolean; isCrawlSourceScrape?: boolean; + from_extract?: boolean; } export interface RunWebScraperParams { From 138a9757aef81b9a02d2520192af09f999713ca0 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 12 Apr 2025 16:38:56 -0700 Subject: [PATCH 076/160] (feat/change-tracking) Change Tracking Modes (#1445) * Add git-diff support to change tracking format Co-Authored-By: Nicolas Camara * Fix type issues with parse-diff library Co-Authored-By: Nicolas Camara * Fix parse-diff type definitions to match actual library structure Co-Authored-By: Nicolas Camara * Add structured output/prompt support to change tracking Co-Authored-By: Nicolas Camara * (feat/change-tracking) Change Tracking Modes (#1447) * Refactor change tracking to use modes array instead of separate formats Co-Authored-By: Nicolas Camara * Implement schema-based change tracking with old/new value comparison Co-Authored-By: Nicolas Camara * Nick: * Nick: .json * Update diff.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara Co-authored-by: Nicolas * Update index.ts * Update types.ts * Update diff.ts * Update scrape.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara Co-authored-by: Nicolas --- apps/api/package.json | 2 + apps/api/pnpm-lock.yaml | 60 ++++++++ apps/api/src/__tests__/snips/scrape.test.ts | 110 ++++++++++++++ apps/api/src/controllers/v1/scrape.ts | 2 +- apps/api/src/controllers/v1/types.ts | 29 ++++ .../scraper/scrapeURL/transformers/diff.ts | 140 +++++++++++++++++- .../scraper/scrapeURL/transformers/index.ts | 18 +++ apps/api/src/types/parse-diff.d.ts | 49 ++++++ apps/js-sdk/firecrawl/src/index.ts | 20 +++ 9 files changed, 426 insertions(+), 4 deletions(-) create mode 100644 apps/api/src/types/parse-diff.d.ts diff --git a/apps/api/package.json b/apps/api/package.json index 1dc26a05..eae70214 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -90,6 +90,7 @@ "express": "^4.18.2", "express-rate-limit": "^7.3.1", "express-ws": "^5.0.2", + "git-diff": "^2.0.6", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.4.1", @@ -110,6 +111,7 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "ollama-ai-provider": "^1.2.0", + "parse-diff": "^0.11.1", "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 8644e7bd..028423d3 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -125,6 +125,9 @@ importers: express-ws: specifier: ^5.0.2 version: 5.0.2(express@4.19.2) + git-diff: + specifier: ^2.0.6 + version: 2.0.6 glob: specifier: ^10.4.2 version: 10.4.2 @@ -185,6 +188,9 @@ importers: ollama-ai-provider: specifier: ^1.2.0 version: 1.2.0(zod@3.24.2) + parse-diff: + specifier: ^0.11.1 + version: 0.11.1 pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -2384,6 +2390,10 @@ packages: resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + diff@3.5.0: + resolution: {integrity: sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==} + engines: {node: '>=0.3.1'} + diff@4.0.2: resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==} engines: {node: '>=0.3.1'} @@ -2762,6 +2772,10 @@ packages: resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==} engines: {node: '>= 14'} + git-diff@2.0.6: + resolution: {integrity: sha512-/Iu4prUrydE3Pb3lCBMbcSNIf81tgGt0W1ZwknnyF62t3tHmtiJTRj0f+1ZIhp3+Rh0ktz1pJVoa7ZXUCskivA==} + engines: {node: '>= 4.8.0'} + glob-parent@5.1.2: resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} engines: {node: '>= 6'} @@ -2955,6 +2969,10 @@ packages: ini@1.3.8: resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==} + interpret@1.4.0: + resolution: {integrity: sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==} + engines: {node: '>= 0.10'} + ioredis@5.4.1: resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==} engines: {node: '>=12.22.0'} @@ -3721,6 +3739,9 @@ packages: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} + parse-diff@0.11.1: + resolution: {integrity: sha512-Oq4j8LAOPOcssanQkIjxosjATBIEJhCxMCxPhMu+Ci4wdNmAEdx0O+a7gzbR2PyKXgKPvRLIN5g224+dJAsKHA==} + parse-json@5.2.0: resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==} engines: {node: '>=8'} @@ -3982,6 +4003,10 @@ packages: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} + rechoir@0.6.2: + resolution: {integrity: sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==} + engines: {node: '>= 0.10'} + redis-errors@1.2.0: resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==} engines: {node: '>=4'} @@ -4148,6 +4173,15 @@ packages: resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} engines: {node: '>=8'} + shelljs.exec@1.1.8: + resolution: {integrity: sha512-vFILCw+lzUtiwBAHV8/Ex8JsFjelFMdhONIsgKNLgTzeRckp2AOYRQtHJE/9LhNvdMmE27AGtzWx0+DHpwIwSw==} + engines: {node: '>= 4.0.0'} + + shelljs@0.8.5: + resolution: {integrity: sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==} + engines: {node: '>=4'} + hasBin: true + shimmer@1.2.1: resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==} @@ -7603,6 +7637,8 @@ snapshots: diff-sequences@29.6.3: {} + diff@3.5.0: {} + diff@4.0.2: {} dingbat-to-unicode@1.0.1: {} @@ -8026,6 +8062,14 @@ snapshots: transitivePeerDependencies: - supports-color + git-diff@2.0.6: + dependencies: + chalk: 2.4.2 + diff: 3.5.0 + loglevel: 1.9.1 + shelljs: 0.8.5 + shelljs.exec: 1.1.8 + glob-parent@5.1.2: dependencies: is-glob: 4.0.3 @@ -8271,6 +8315,8 @@ snapshots: ini@1.3.8: {} + interpret@1.4.0: {} + ioredis@5.4.1: dependencies: '@ioredis/commands': 1.2.0 @@ -9249,6 +9295,8 @@ snapshots: dependencies: callsites: 3.1.0 + parse-diff@0.11.1: {} + parse-json@5.2.0: dependencies: '@babel/code-frame': 7.24.7 @@ -9546,6 +9594,10 @@ snapshots: dependencies: picomatch: 2.3.1 + rechoir@0.6.2: + dependencies: + resolve: 1.22.8 + redis-errors@1.2.0: {} redis-info@3.1.0: @@ -9718,6 +9770,14 @@ snapshots: shebang-regex@3.0.0: {} + shelljs.exec@1.1.8: {} + + shelljs@0.8.5: + dependencies: + glob: 7.2.3 + interpret: 1.4.0 + rechoir: 0.6.2 + shimmer@1.2.1: {} side-channel@1.0.6: diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 097f5504..d9d97330 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -95,6 +95,116 @@ describe("Scrape tests", () => { expect(response.changeTracking).toBeDefined(); expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); }, 30000); + + it.concurrent("includes git diff when requested", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["git-diff"] + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.diff).toBeDefined(); + expect(response.changeTracking?.diff?.text).toBeDefined(); + expect(response.changeTracking?.diff?.json).toBeDefined(); + expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array); + } + }, 30000); + + it.concurrent("includes structured output when requested", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["json"], + prompt: "Summarize the changes between the previous and current content", + systemPrompt: "You are a helpful assistant that summarizes changes between document versions." + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.json).toBeDefined(); + } + }, 30000); + + it.concurrent("supports schema-based extraction for change tracking", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["json"], + schema: { + type: "object", + properties: { + pricing: { + type: "object", + properties: { + amount: { type: "number" }, + currency: { type: "string" } + } + }, + features: { + type: "array", + items: { type: "string" } + } + } + } + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.json).toBeDefined(); + if (response.changeTracking?.json.pricing) { + expect(response.changeTracking?.json.pricing).toHaveProperty("old"); + expect(response.changeTracking?.json.pricing).toHaveProperty("new"); + } + if (response.changeTracking?.json.features) { + expect(response.changeTracking?.json.features).toHaveProperty("old"); + expect(response.changeTracking?.json.features).toHaveProperty("new"); + } + } + }, 30000); + + it.concurrent("supports both git-diff and structured modes together", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["git-diff", "json"], + schema: { + type: "object", + properties: { + summary: { type: "string" }, + changes: { type: "array", items: { type: "string" } } + } + } + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.diff).toBeDefined(); + expect(response.changeTracking?.diff?.text).toBeDefined(); + expect(response.changeTracking?.diff?.json).toBeDefined(); + + expect(response.changeTracking?.json).toBeDefined(); + expect(response.changeTracking?.json).toHaveProperty("summary"); + expect(response.changeTracking?.json).toHaveProperty("changes"); + } + }, 30000); }); describe("Location API (f-e dependant)", () => { diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ccf81cda..6259981d 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -104,7 +104,7 @@ export async function scrapeController( // Don't bill if we're early returning return; } - if (req.body.extract && req.body.formats.includes("extract")) { + if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) { creditsToBeBilled = 5; } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index fa059e6f..56a93072 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -20,6 +20,7 @@ export type Format = | "screenshot" | "screenshot@fullPage" | "extract" + | "json" | "changeTracking"; export const url = z.preprocess( @@ -195,6 +196,13 @@ const baseScrapeOptions = z extract: extractOptions.optional(), // New jsonOptions: extractOptions.optional(), + changeTrackingOptions: z + .object({ + prompt: z.string().optional(), + schema: z.any().optional(), + modes: z.enum(["json", "git-diff"]).array().optional().default([]), + }) + .optional(), mobile: z.boolean().default(false), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), @@ -555,6 +563,27 @@ export type Document = { previousScrapeAt: string | null; changeStatus: "new" | "same" | "changed" | "removed"; visibility: "visible" | "hidden"; + diff?: { + text: string; + json: { + files: Array<{ + from: string | null; + to: string | null; + chunks: Array<{ + content: string; + changes: Array<{ + type: string; + normal?: boolean; + ln?: number; + ln1?: number; + ln2?: number; + content: string; + }>; + }>; + }>; + }; + }; + json?: any; } metadata: { title?: string; diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index ea98fa6a..f1c2d1fc 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -2,6 +2,53 @@ import { supabase_service } from "../../../services/supabase"; import { Document } from "../../../controllers/v1/types"; import { Meta } from "../index"; import { getJob } from "../../../controllers/v1/crawl-status"; +import gitDiff from 'git-diff'; +import parseDiff from 'parse-diff'; +import { generateCompletions } from "./llmExtract"; + +async function extractDataWithSchema(content: string, meta: Meta): Promise { + try { + const { extract } = await generateCompletions({ + logger: meta.logger.child({ + method: "extractDataWithSchema/generateCompletions", + }), + options: { + mode: "llm", + schema: meta.options.changeTrackingOptions?.schema, + systemPrompt: "Extract the requested information from the content based on the provided schema.", + temperature: 0 + }, + markdown: content + }); + return extract; + } catch (error) { + meta.logger.error("Error extracting data with schema", { error }); + return null; + } +} + +function compareExtractedData(previousData: any, currentData: any): any { + const result: Record = {}; + + const allKeys = new Set([ + ...Object.keys(previousData || {}), + ...Object.keys(currentData || {}) + ]); + + for (const key of allKeys) { + const oldValue = previousData?.[key]; + const newValue = currentData?.[key]; + + if (JSON.stringify(oldValue) !== JSON.stringify(newValue)) { + result[key] = { + previous: oldValue, + current: newValue + }; + } + } + + return result; +} export async function deriveDiff(meta: Meta, document: Document): Promise { if (meta.options.formats.includes("changeTracking")) { @@ -20,19 +67,106 @@ export async function deriveDiff(meta: Meta, document: Document): Promise [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); + const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown); + const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same"; document.changeTracking = { previousScrapeAt: data.o_date_added, - changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed", + changeStatus, visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", } + + if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") { + const diffText = gitDiff(previousMarkdown, currentMarkdown, { + color: false, + wordDiff: false + }); + + if (diffText) { + const diffStructured = parseDiff(diffText); + document.changeTracking.diff = { + text: diffText, + json: { + files: diffStructured.map(file => ({ + from: file.from || null, + to: file.to || null, + chunks: file.chunks.map(chunk => ({ + content: chunk.content, + changes: chunk.changes.map(change => { + const baseChange = { + type: change.type, + content: change.content + }; + + if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) { + return { + ...baseChange, + normal: true, + ln1: change.ln1, + ln2: change.ln2 + }; + } else if (change.type === 'add' && 'ln' in change) { + return { + ...baseChange, + add: true, + ln: change.ln + }; + } else if (change.type === 'del' && 'ln' in change) { + return { + ...baseChange, + del: true, + ln: change.ln + }; + } + + return baseChange; + }) + })) + })) + } + }; + } + } + + if (meta.options.changeTrackingOptions?.modes?.includes("json") && + meta.options.changeTrackingOptions && changeStatus === "changed") { + try { + const previousData = meta.options.changeTrackingOptions.schema ? + await extractDataWithSchema(previousMarkdown, meta) : null; + + const currentData = meta.options.changeTrackingOptions.schema ? + await extractDataWithSchema(currentMarkdown, meta) : null; + + if (previousData && currentData) { + document.changeTracking.json = compareExtractedData(previousData, currentData); + } else { + const { extract } = await generateCompletions({ + logger: meta.logger.child({ + method: "deriveDiff/generateCompletions", + }), + options: { + mode: "llm", + systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.", + schema: meta.options.changeTrackingOptions.schema, + prompt: meta.options.changeTrackingOptions.prompt, + temperature: 0 + }, + markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`, + previousWarning: document.warning + }); + + document.changeTracking.json = extract; + } + } catch (error) { + meta.logger.error("Error generating structured diff with LLM", { error }); + document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : ""); + } + } } else if (!res.error) { document.changeTracking = { previousScrapeAt: null, diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index 5b16ae99..2544995d 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -159,6 +159,24 @@ export function coerceFieldsToFormats( ); } + if (document.changeTracking && + (!meta.options.changeTrackingOptions?.modes?.includes("git-diff")) && + document.changeTracking.diff !== undefined) { + meta.logger.warn( + "Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.", + ); + delete document.changeTracking.diff; + } + + if (document.changeTracking && + (!meta.options.changeTrackingOptions?.modes?.includes("json")) && + document.changeTracking.json !== undefined) { + meta.logger.warn( + "Removed structured from changeTracking because structured mode wasn't specified in changeTrackingOptions.modes.", + ); + delete document.changeTracking.json; + } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { delete document.actions; } diff --git a/apps/api/src/types/parse-diff.d.ts b/apps/api/src/types/parse-diff.d.ts new file mode 100644 index 00000000..8eca3b2b --- /dev/null +++ b/apps/api/src/types/parse-diff.d.ts @@ -0,0 +1,49 @@ +declare module 'parse-diff' { + interface NormalChange { + type: 'normal'; + normal: true; + ln1: number; + ln2: number; + content: string; + } + + interface AddChange { + type: 'add'; + add: true; + ln: number; + content: string; + } + + interface DeleteChange { + type: 'del'; + del: true; + ln: number; + content: string; + } + + type Change = NormalChange | AddChange | DeleteChange; + + interface Chunk { + content: string; + changes: Change[]; + oldStart: number; + oldLines: number; + newStart: number; + newLines: number; + } + + interface File { + chunks: Chunk[]; + deletions: number; + additions: number; + from: string | null; + to: string | null; + index?: string[]; + newMode?: string; + oldMode?: string; + binary?: boolean; + } + + function parseDiff(diff: string): File[]; + export = parseDiff; +} diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6f76220f..7ba44b70 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -72,6 +72,26 @@ export interface FirecrawlDocument; + }>; + }>; + }; + }; }; // v1 search only title?: string; From ef341399f00df3ec4082ecba6d123766a62f2dbf Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 12 Apr 2025 16:47:24 -0700 Subject: [PATCH 077/160] Add change tracking support to Python and JS SDKs (#1448) * Add change tracking support to Python and JS SDKs Co-Authored-By: Nicolas Camara * Replace test API keys with TEST_API_KEY placeholder Co-Authored-By: Nicolas Camara * Replace API keys with dummy values for testing Co-Authored-By: Nicolas Camara * Use environment variables for API keys in tests Co-Authored-By: Nicolas Camara * Move JS SDK test to correct location and add dependencies Co-Authored-By: Nicolas Camara * Remove old test file location Co-Authored-By: Nicolas Camara * Update test file to use TEST_API_KEY environment variable Co-Authored-By: Nicolas Camara * Update Python SDK test to use TEST_API_KEY environment variable Co-Authored-By: Nicolas Camara * Update package.json * Update __init__.py --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara Co-authored-by: Nicolas --- apps/js-sdk/firecrawl/package-lock.json | 37 +++--- apps/js-sdk/firecrawl/package.json | 8 +- .../v1/snips/change-tracking.test.ts | 105 ++++++++++++++++++ apps/js-sdk/firecrawl/src/index.ts | 8 +- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 16 ++- apps/python-sdk/tests/test_change_tracking.py | 98 ++++++++++++++++ 7 files changed, 254 insertions(+), 20 deletions(-) create mode 100644 apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts create mode 100644 apps/python-sdk/tests/test_change_tracking.py diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 593a039e..a96dc2f3 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,15 +1,14 @@ { "name": "@mendable/firecrawl-js", - "version": "1.22.0", + "version": "1.22.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.22.0", + "version": "1.22.1", "license": "MIT", "dependencies": { - "axios": "^1.6.8", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -18,10 +17,11 @@ "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", - "@types/jest": "^29.5.12", + "@types/jest": "^29.5.14", "@types/mocha": "^10.0.6", - "@types/node": "^20.12.12", + "@types/node": "^20.17.30", "@types/uuid": "^9.0.8", + "axios": "^1.8.4", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", @@ -1812,10 +1812,11 @@ } }, "node_modules/@types/jest": { - "version": "29.5.12", - "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.12.tgz", - "integrity": "sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==", + "version": "29.5.14", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.14.tgz", + "integrity": "sha512-ZN+4sdnLUbo8EVvVc2ao0GFW6oVrQRPn4K2lglySj7APvSrgzxHiNNK99us4WDMi57xxA2yggblIAMNhXOotLQ==", "dev": true, + "license": "MIT", "dependencies": { "expect": "^29.0.0", "pretty-format": "^29.0.0" @@ -1949,12 +1950,15 @@ "node_modules/asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "dev": true }, "node_modules/axios": { - "version": "1.6.8", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", - "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", + "version": "1.8.4", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz", + "integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==", + "dev": true, + "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", "form-data": "^4.0.0", @@ -2351,6 +2355,7 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dev": true, "dependencies": { "delayed-stream": "~1.0.0" }, @@ -2467,6 +2472,7 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "dev": true, "engines": { "node": ">=0.4.0" } @@ -2784,6 +2790,7 @@ "version": "1.15.6", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", + "dev": true, "funding": [ { "type": "individual", @@ -2831,6 +2838,7 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dev": true, "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", @@ -4111,6 +4119,7 @@ "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "dev": true, "engines": { "node": ">= 0.6" } @@ -4119,6 +4128,7 @@ "version": "2.1.35", "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dev": true, "dependencies": { "mime-db": "1.52.0" }, @@ -4507,7 +4517,8 @@ "node_modules/proxy-from-env": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "dev": true }, "node_modules/punycode": { "version": "2.3.1", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 7c64b33e..bf7ed81d 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.22.1", + "version": "1.23.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -25,7 +25,6 @@ "author": "Mendable.ai", "license": "MIT", "dependencies": { - "axios": "^1.6.8", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -38,10 +37,11 @@ "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", - "@types/jest": "^29.5.12", + "@types/jest": "^29.5.14", "@types/mocha": "^10.0.6", - "@types/node": "^20.12.12", + "@types/node": "^20.17.30", "@types/uuid": "^9.0.8", + "axios": "^1.8.4", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts new file mode 100644 index 00000000..835582ce --- /dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts @@ -0,0 +1,105 @@ +import axios from 'axios'; +import FirecrawlApp from '../../../../src/index'; + +jest.mock('axios'); +const mockedAxios = axios as jest.Mocked; + +describe('Change Tracking Tests', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + it('should support basic change tracking format', async () => { + mockedAxios.post.mockResolvedValueOnce({ + status: 200, + data: { + success: true, + data: { + markdown: 'Test markdown content', + changeTracking: { + previousScrapeAt: '2023-01-01T00:00:00Z', + changeStatus: 'changed', + visibility: 'visible' + } + } + } + }); + + const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY || 'dummy-api-key-for-testing' }); + const result = await app.scrapeUrl('https://example.com', { + formats: ['markdown', 'changeTracking'] + }); + + expect(mockedAxios.post).toHaveBeenCalledTimes(1); + expect(mockedAxios.post.mock.calls[0][1].formats).toContain('changeTracking'); + + expect(result).toHaveProperty('changeTracking'); + expect(result.changeTracking?.previousScrapeAt).toBe('2023-01-01T00:00:00Z'); + expect(result.changeTracking?.changeStatus).toBe('changed'); + expect(result.changeTracking?.visibility).toBe('visible'); + }); + + it('should support change tracking options with git-diff and json modes', async () => { + mockedAxios.post.mockResolvedValueOnce({ + status: 200, + data: { + success: true, + data: { + markdown: 'Test markdown content', + changeTracking: { + previousScrapeAt: '2023-01-01T00:00:00Z', + changeStatus: 'changed', + visibility: 'visible', + diff: { + text: '@@ -1,1 +1,1 @@\n-old content\n+new content', + json: { + files: [{ + from: null, + to: null, + chunks: [{ + content: '@@ -1,1 +1,1 @@', + changes: [{ + type: 'del', + content: '-old content', + del: true, + ln: 1 + }, { + type: 'add', + content: '+new content', + add: true, + ln: 1 + }] + }] + }] + } + }, + json: { + title: { + previous: 'Old Title', + current: 'New Title' + } + } + } + } + } + }); + + const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY || 'dummy-api-key-for-testing' }); + const result = await app.scrapeUrl('https://example.com', { + formats: ['markdown', 'changeTracking'], + changeTrackingOptions: { + modes: ['git-diff', 'json'], + schema: { type: 'object', properties: { title: { type: 'string' } } } + } + }); + + expect(mockedAxios.post).toHaveBeenCalledTimes(1); + expect(mockedAxios.post.mock.calls[0][1].formats).toContain('changeTracking'); + expect(mockedAxios.post.mock.calls[0][1].changeTrackingOptions.modes).toEqual(['git-diff', 'json']); + + expect(result).toHaveProperty('changeTracking'); + expect(result.changeTracking?.diff?.text).toBe('@@ -1,1 +1,1 @@\n-old content\n+new content'); + expect(result.changeTracking?.json?.title.previous).toBe('Old Title'); + expect(result.changeTracking?.json?.title.current).toBe('New Title'); + }); +}); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 7ba44b70..28fbe075 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -74,7 +74,7 @@ export interface FirecrawlDocument; }; }; + json?: any; }; // v1 search only title?: string; @@ -160,6 +161,11 @@ export interface ScrapeParams Date: Sat, 12 Apr 2025 16:49:36 -0700 Subject: [PATCH 078/160] fix(api/tests/scrape): schema change --- apps/api/src/__tests__/snips/scrape.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index d9d97330..b9f1401e 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -123,7 +123,6 @@ describe("Scrape tests", () => { changeTrackingOptions: { modes: ["json"], prompt: "Summarize the changes between the previous and current content", - systemPrompt: "You are a helpful assistant that summarizes changes between document versions." } }); From a03b26a45f2d8812d364f1be4774426585e1fb43 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Apr 2025 17:11:56 -0700 Subject: [PATCH 079/160] Update package.json --- apps/js-sdk/firecrawl/package.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index bf7ed81d..d1ca7c9a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.0", + "version": "1.23.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -39,9 +39,9 @@ "@types/dotenv": "^8.2.0", "@types/jest": "^29.5.14", "@types/mocha": "^10.0.6", - "@types/node": "^20.17.30", + "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", - "axios": "^1.8.4", + "axios": "^1.6.8", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", From c5079074bcdfc69ad684bcfb44290c8b95b5bdd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:16:28 -0700 Subject: [PATCH 080/160] js-sdk: bump --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index d1ca7c9a..698c007e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.1", + "version": "1.23.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 950a9512bd12a172afdff7ed64027e1975b355ad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Apr 2025 17:21:36 -0700 Subject: [PATCH 081/160] Nick: --- apps/js-sdk/firecrawl/package.json | 2 +- .../v1/snips/change-tracking.test.ts | 105 ------------------ 2 files changed, 1 insertion(+), 106 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 698c007e..98cfb77a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.2", + "version": "1.23.3", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts deleted file mode 100644 index 835582ce..00000000 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/snips/change-tracking.test.ts +++ /dev/null @@ -1,105 +0,0 @@ -import axios from 'axios'; -import FirecrawlApp from '../../../../src/index'; - -jest.mock('axios'); -const mockedAxios = axios as jest.Mocked; - -describe('Change Tracking Tests', () => { - beforeEach(() => { - jest.resetAllMocks(); - }); - - it('should support basic change tracking format', async () => { - mockedAxios.post.mockResolvedValueOnce({ - status: 200, - data: { - success: true, - data: { - markdown: 'Test markdown content', - changeTracking: { - previousScrapeAt: '2023-01-01T00:00:00Z', - changeStatus: 'changed', - visibility: 'visible' - } - } - } - }); - - const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY || 'dummy-api-key-for-testing' }); - const result = await app.scrapeUrl('https://example.com', { - formats: ['markdown', 'changeTracking'] - }); - - expect(mockedAxios.post).toHaveBeenCalledTimes(1); - expect(mockedAxios.post.mock.calls[0][1].formats).toContain('changeTracking'); - - expect(result).toHaveProperty('changeTracking'); - expect(result.changeTracking?.previousScrapeAt).toBe('2023-01-01T00:00:00Z'); - expect(result.changeTracking?.changeStatus).toBe('changed'); - expect(result.changeTracking?.visibility).toBe('visible'); - }); - - it('should support change tracking options with git-diff and json modes', async () => { - mockedAxios.post.mockResolvedValueOnce({ - status: 200, - data: { - success: true, - data: { - markdown: 'Test markdown content', - changeTracking: { - previousScrapeAt: '2023-01-01T00:00:00Z', - changeStatus: 'changed', - visibility: 'visible', - diff: { - text: '@@ -1,1 +1,1 @@\n-old content\n+new content', - json: { - files: [{ - from: null, - to: null, - chunks: [{ - content: '@@ -1,1 +1,1 @@', - changes: [{ - type: 'del', - content: '-old content', - del: true, - ln: 1 - }, { - type: 'add', - content: '+new content', - add: true, - ln: 1 - }] - }] - }] - } - }, - json: { - title: { - previous: 'Old Title', - current: 'New Title' - } - } - } - } - } - }); - - const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY || 'dummy-api-key-for-testing' }); - const result = await app.scrapeUrl('https://example.com', { - formats: ['markdown', 'changeTracking'], - changeTrackingOptions: { - modes: ['git-diff', 'json'], - schema: { type: 'object', properties: { title: { type: 'string' } } } - } - }); - - expect(mockedAxios.post).toHaveBeenCalledTimes(1); - expect(mockedAxios.post.mock.calls[0][1].formats).toContain('changeTracking'); - expect(mockedAxios.post.mock.calls[0][1].changeTrackingOptions.modes).toEqual(['git-diff', 'json']); - - expect(result).toHaveProperty('changeTracking'); - expect(result.changeTracking?.diff?.text).toBe('@@ -1,1 +1,1 @@\n-old content\n+new content'); - expect(result.changeTracking?.json?.title.previous).toBe('Old Title'); - expect(result.changeTracking?.json?.title.current).toBe('New Title'); - }); -}); From 4414fbca43ae4bff7e8bb73d2a1f2aaf8c719c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:23:01 -0700 Subject: [PATCH 082/160] fix lock --- apps/js-sdk/firecrawl/package-lock.json | 571 +----------------------- apps/js-sdk/firecrawl/package.json | 2 +- 2 files changed, 5 insertions(+), 568 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index a96dc2f3..b6263b8c 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "1.22.1", + "version": "1.23.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.22.1", + "version": "1.23.4", "license": "MIT", "dependencies": { "typescript-event-target": "^1.1.1", @@ -19,9 +19,9 @@ "@types/dotenv": "^8.2.0", "@types/jest": "^29.5.14", "@types/mocha": "^10.0.6", - "@types/node": "^20.17.30", + "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", - "axios": "^1.8.4", + "axios": "^1.6.8", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", @@ -603,70 +603,6 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.23.1.tgz", - "integrity": "sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==", - "cpu": [ - "ppc64" - ], - "dev": true, - "optional": true, - "os": [ - "aix" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.23.1.tgz", - "integrity": "sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==", - "cpu": [ - "arm" - ], - "dev": true, - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.23.1.tgz", - "integrity": "sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.23.1.tgz", - "integrity": "sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, "node_modules/@esbuild/darwin-arm64": { "version": "0.23.1", "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.23.1.tgz", @@ -683,310 +619,6 @@ "node": ">=18" } }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.23.1.tgz", - "integrity": "sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.1.tgz", - "integrity": "sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.23.1.tgz", - "integrity": "sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.23.1.tgz", - "integrity": "sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==", - "cpu": [ - "arm" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.23.1.tgz", - "integrity": "sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.23.1.tgz", - "integrity": "sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==", - "cpu": [ - "ia32" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.23.1.tgz", - "integrity": "sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==", - "cpu": [ - "loong64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.23.1.tgz", - "integrity": "sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==", - "cpu": [ - "mips64el" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.23.1.tgz", - "integrity": "sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==", - "cpu": [ - "ppc64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.23.1.tgz", - "integrity": "sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==", - "cpu": [ - "riscv64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.23.1.tgz", - "integrity": "sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==", - "cpu": [ - "s390x" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.23.1.tgz", - "integrity": "sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.23.1.tgz", - "integrity": "sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.1.tgz", - "integrity": "sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.23.1.tgz", - "integrity": "sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.23.1.tgz", - "integrity": "sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "sunos" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.23.1.tgz", - "integrity": "sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.23.1.tgz", - "integrity": "sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==", - "cpu": [ - "ia32" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-x64": { - "version": "0.23.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.23.1.tgz", - "integrity": "sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -1479,32 +1111,6 @@ "node": ">=14" } }, - "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.21.2.tgz", - "integrity": "sha512-fSuPrt0ZO8uXeS+xP3b+yYTCBUd05MoSp2N/MFOgjhhUhMmchXlpTQrTpI8T+YAwAQuK7MafsCOxW7VrPMrJcg==", - "cpu": [ - "arm" - ], - "dev": true, - "optional": true, - "os": [ - "android" - ] - }, - "node_modules/@rollup/rollup-android-arm64": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.21.2.tgz", - "integrity": "sha512-xGU5ZQmPlsjQS6tzTTGwMsnKUtu0WVbl0hYpTPauvbRAnmIvpInhJtgjj3mcuJpEiuUw4v1s4BimkdfDWlh7gA==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "android" - ] - }, "node_modules/@rollup/rollup-darwin-arm64": { "version": "4.21.2", "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.21.2.tgz", @@ -1518,175 +1124,6 @@ "darwin" ] }, - "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.21.2.tgz", - "integrity": "sha512-ZbRaUvw2iN/y37x6dY50D8m2BnDbBjlnMPotDi/qITMJ4sIxNY33HArjikDyakhSv0+ybdUxhWxE6kTI4oX26w==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "darwin" - ] - }, - "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.21.2.tgz", - "integrity": "sha512-ztRJJMiE8nnU1YFcdbd9BcH6bGWG1z+jP+IPW2oDUAPxPjo9dverIOyXz76m6IPA6udEL12reYeLojzW2cYL7w==", - "cpu": [ - "arm" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.21.2.tgz", - "integrity": "sha512-flOcGHDZajGKYpLV0JNc0VFH361M7rnV1ee+NTeC/BQQ1/0pllYcFmxpagltANYt8FYf9+kL6RSk80Ziwyhr7w==", - "cpu": [ - "arm" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.21.2.tgz", - "integrity": "sha512-69CF19Kp3TdMopyteO/LJbWufOzqqXzkrv4L2sP8kfMaAQ6iwky7NoXTp7bD6/irKgknDKM0P9E/1l5XxVQAhw==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.21.2.tgz", - "integrity": "sha512-48pD/fJkTiHAZTnZwR0VzHrao70/4MlzJrq0ZsILjLW/Ab/1XlVUStYyGt7tdyIiVSlGZbnliqmult/QGA2O2w==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.21.2.tgz", - "integrity": "sha512-cZdyuInj0ofc7mAQpKcPR2a2iu4YM4FQfuUzCVA2u4HI95lCwzjoPtdWjdpDKyHxI0UO82bLDoOaLfpZ/wviyQ==", - "cpu": [ - "ppc64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.21.2.tgz", - "integrity": "sha512-RL56JMT6NwQ0lXIQmMIWr1SW28z4E4pOhRRNqwWZeXpRlykRIlEpSWdsgNWJbYBEWD84eocjSGDu/XxbYeCmwg==", - "cpu": [ - "riscv64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.21.2.tgz", - "integrity": "sha512-PMxkrWS9z38bCr3rWvDFVGD6sFeZJw4iQlhrup7ReGmfn7Oukrr/zweLhYX6v2/8J6Cep9IEA/SmjXjCmSbrMQ==", - "cpu": [ - "s390x" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.21.2.tgz", - "integrity": "sha512-B90tYAUoLhU22olrafY3JQCFLnT3NglazdwkHyxNDYF/zAxJt5fJUB/yBoWFoIQ7SQj+KLe3iL4BhOMa9fzgpw==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.21.2.tgz", - "integrity": "sha512-7twFizNXudESmC9oneLGIUmoHiiLppz/Xs5uJQ4ShvE6234K0VB1/aJYU3f/4g7PhssLGKBVCC37uRkkOi8wjg==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.21.2.tgz", - "integrity": "sha512-9rRero0E7qTeYf6+rFh3AErTNU1VCQg2mn7CQcI44vNUWM9Ze7MSRS/9RFuSsox+vstRt97+x3sOhEey024FRQ==", - "cpu": [ - "arm64" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.21.2.tgz", - "integrity": "sha512-5rA4vjlqgrpbFVVHX3qkrCo/fZTj1q0Xxpg+Z7yIo3J2AilW7t2+n6Q8Jrx+4MrYpAnjttTYF8rr7bP46BPzRw==", - "cpu": [ - "ia32" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.21.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.21.2.tgz", - "integrity": "sha512-6UUxd0+SKomjdzuAcp+HAmxw1FlGBnl1v2yEPSabtx4lBfdXHDVsW7+lQkgz9cNFJGY3AWR7+V8P5BqkD9L9nA==", - "cpu": [ - "x64" - ], - "dev": true, - "optional": true, - "os": [ - "win32" - ] - }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 98cfb77a..ce4f1f15 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.3", + "version": "1.23.4", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 3cf6d88bfff055fc88e116dbae48e5e871c64d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:26:17 -0700 Subject: [PATCH 083/160] js-sdk: change ci --- .github/workflows/publish-js-sdk.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-js-sdk.yml b/.github/workflows/publish-js-sdk.yml index d9f13611..88708b8a 100644 --- a/.github/workflows/publish-js-sdk.yml +++ b/.github/workflows/publish-js-sdk.yml @@ -24,7 +24,7 @@ jobs: run: echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" > ~/.npmrc - name: Publish run: | - npm ci + npm install npm run build npm publish --access public sed -i 's/"name": "@mendable\/firecrawl-js"/"name": "@mendable\/firecrawl"/g' package.json From 557df1ab95296531e445c33f2d36ec2ee0f15ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:26:35 -0700 Subject: [PATCH 084/160] js-sdk: bump --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index ce4f1f15..a62e5866 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.4", + "version": "1.23.5", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 32798e2200ba456a9f458353435158632abd21fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:27:47 -0700 Subject: [PATCH 085/160] revert lock --- apps/js-sdk/firecrawl/package-lock.json | 598 +++++++++++++++++++++++- apps/js-sdk/firecrawl/package.json | 2 +- 2 files changed, 576 insertions(+), 24 deletions(-) diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index b6263b8c..c6cfbace 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,14 +1,15 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.4", + "version": "1.23.6", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "1.23.4", + "version": "1.23.6", "license": "MIT", "dependencies": { + "axios": "^1.6.8", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" @@ -17,11 +18,10 @@ "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", - "@types/jest": "^29.5.14", + "@types/jest": "^29.5.12", "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", - "axios": "^1.6.8", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", @@ -603,6 +603,70 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.23.1.tgz", + "integrity": "sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.23.1.tgz", + "integrity": "sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.23.1.tgz", + "integrity": "sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.23.1.tgz", + "integrity": "sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@esbuild/darwin-arm64": { "version": "0.23.1", "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.23.1.tgz", @@ -619,6 +683,310 @@ "node": ">=18" } }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.23.1.tgz", + "integrity": "sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.1.tgz", + "integrity": "sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.23.1.tgz", + "integrity": "sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.23.1.tgz", + "integrity": "sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.23.1.tgz", + "integrity": "sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.23.1.tgz", + "integrity": "sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.23.1.tgz", + "integrity": "sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.23.1.tgz", + "integrity": "sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==", + "cpu": [ + "mips64el" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.23.1.tgz", + "integrity": "sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.23.1.tgz", + "integrity": "sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.23.1.tgz", + "integrity": "sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.23.1.tgz", + "integrity": "sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.23.1.tgz", + "integrity": "sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.1.tgz", + "integrity": "sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.23.1.tgz", + "integrity": "sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.23.1.tgz", + "integrity": "sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.23.1.tgz", + "integrity": "sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.23.1.tgz", + "integrity": "sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.23.1.tgz", + "integrity": "sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -1111,6 +1479,32 @@ "node": ">=14" } }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.21.2.tgz", + "integrity": "sha512-fSuPrt0ZO8uXeS+xP3b+yYTCBUd05MoSp2N/MFOgjhhUhMmchXlpTQrTpI8T+YAwAQuK7MafsCOxW7VrPMrJcg==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.21.2.tgz", + "integrity": "sha512-xGU5ZQmPlsjQS6tzTTGwMsnKUtu0WVbl0hYpTPauvbRAnmIvpInhJtgjj3mcuJpEiuUw4v1s4BimkdfDWlh7gA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, "node_modules/@rollup/rollup-darwin-arm64": { "version": "4.21.2", "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.21.2.tgz", @@ -1124,6 +1518,175 @@ "darwin" ] }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.21.2.tgz", + "integrity": "sha512-ZbRaUvw2iN/y37x6dY50D8m2BnDbBjlnMPotDi/qITMJ4sIxNY33HArjikDyakhSv0+ybdUxhWxE6kTI4oX26w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.21.2.tgz", + "integrity": "sha512-ztRJJMiE8nnU1YFcdbd9BcH6bGWG1z+jP+IPW2oDUAPxPjo9dverIOyXz76m6IPA6udEL12reYeLojzW2cYL7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.21.2.tgz", + "integrity": "sha512-flOcGHDZajGKYpLV0JNc0VFH361M7rnV1ee+NTeC/BQQ1/0pllYcFmxpagltANYt8FYf9+kL6RSk80Ziwyhr7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.21.2.tgz", + "integrity": "sha512-69CF19Kp3TdMopyteO/LJbWufOzqqXzkrv4L2sP8kfMaAQ6iwky7NoXTp7bD6/irKgknDKM0P9E/1l5XxVQAhw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.21.2.tgz", + "integrity": "sha512-48pD/fJkTiHAZTnZwR0VzHrao70/4MlzJrq0ZsILjLW/Ab/1XlVUStYyGt7tdyIiVSlGZbnliqmult/QGA2O2w==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.21.2.tgz", + "integrity": "sha512-cZdyuInj0ofc7mAQpKcPR2a2iu4YM4FQfuUzCVA2u4HI95lCwzjoPtdWjdpDKyHxI0UO82bLDoOaLfpZ/wviyQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.21.2.tgz", + "integrity": "sha512-RL56JMT6NwQ0lXIQmMIWr1SW28z4E4pOhRRNqwWZeXpRlykRIlEpSWdsgNWJbYBEWD84eocjSGDu/XxbYeCmwg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.21.2.tgz", + "integrity": "sha512-PMxkrWS9z38bCr3rWvDFVGD6sFeZJw4iQlhrup7ReGmfn7Oukrr/zweLhYX6v2/8J6Cep9IEA/SmjXjCmSbrMQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.21.2.tgz", + "integrity": "sha512-B90tYAUoLhU22olrafY3JQCFLnT3NglazdwkHyxNDYF/zAxJt5fJUB/yBoWFoIQ7SQj+KLe3iL4BhOMa9fzgpw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.21.2.tgz", + "integrity": "sha512-7twFizNXudESmC9oneLGIUmoHiiLppz/Xs5uJQ4ShvE6234K0VB1/aJYU3f/4g7PhssLGKBVCC37uRkkOi8wjg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.21.2.tgz", + "integrity": "sha512-9rRero0E7qTeYf6+rFh3AErTNU1VCQg2mn7CQcI44vNUWM9Ze7MSRS/9RFuSsox+vstRt97+x3sOhEey024FRQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.21.2.tgz", + "integrity": "sha512-5rA4vjlqgrpbFVVHX3qkrCo/fZTj1q0Xxpg+Z7yIo3J2AilW7t2+n6Q8Jrx+4MrYpAnjttTYF8rr7bP46BPzRw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.21.2.tgz", + "integrity": "sha512-6UUxd0+SKomjdzuAcp+HAmxw1FlGBnl1v2yEPSabtx4lBfdXHDVsW7+lQkgz9cNFJGY3AWR7+V8P5BqkD9L9nA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -1249,11 +1812,10 @@ } }, "node_modules/@types/jest": { - "version": "29.5.14", - "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.14.tgz", - "integrity": "sha512-ZN+4sdnLUbo8EVvVc2ao0GFW6oVrQRPn4K2lglySj7APvSrgzxHiNNK99us4WDMi57xxA2yggblIAMNhXOotLQ==", + "version": "29.5.12", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.12.tgz", + "integrity": "sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==", "dev": true, - "license": "MIT", "dependencies": { "expect": "^29.0.0", "pretty-format": "^29.0.0" @@ -1387,15 +1949,12 @@ "node_modules/asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", - "dev": true + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" }, "node_modules/axios": { - "version": "1.8.4", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz", - "integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==", - "dev": true, - "license": "MIT", + "version": "1.6.8", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", + "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", "dependencies": { "follow-redirects": "^1.15.6", "form-data": "^4.0.0", @@ -1792,7 +2351,6 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "dev": true, "dependencies": { "delayed-stream": "~1.0.0" }, @@ -1909,7 +2467,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "dev": true, "engines": { "node": ">=0.4.0" } @@ -2227,7 +2784,6 @@ "version": "1.15.6", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", - "dev": true, "funding": [ { "type": "individual", @@ -2275,7 +2831,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", - "dev": true, "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", @@ -3556,7 +4111,6 @@ "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "dev": true, "engines": { "node": ">= 0.6" } @@ -3565,7 +4119,6 @@ "version": "2.1.35", "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "dev": true, "dependencies": { "mime-db": "1.52.0" }, @@ -3954,8 +4507,7 @@ "node_modules/proxy-from-env": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", - "dev": true + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, "node_modules/punycode": { "version": "2.3.1", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a62e5866..1b5b93a9 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.5", + "version": "1.23.6", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 62c842e64a1b6275b80a3d7e3c2820bce4bcc31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:40:16 -0700 Subject: [PATCH 086/160] js-sdk: fix tsup config --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/tsup.config.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 1b5b93a9..02c0be3f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.6", + "version": "1.23.7", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts index 31d67739..1ab62aeb 100644 --- a/apps/js-sdk/firecrawl/tsup.config.ts +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -9,6 +9,7 @@ export default defineConfig({ platform: "node", target: "node22", noExternal: ["typescript-event-target"], + external: ["axios"], esbuildOptions(options) { options.define = { ...options.define, From 58ad7f40b6b0a83ec7180f4ee6f5f85998040b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:42:30 -0700 Subject: [PATCH 087/160] js-sdk: once more --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/tsup.config.ts | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 02c0be3f..e19699b3 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.7", + "version": "1.23.8", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts index 1ab62aeb..c2d4d51f 100644 --- a/apps/js-sdk/firecrawl/tsup.config.ts +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -8,8 +8,7 @@ export default defineConfig({ clean: true, platform: "node", target: "node22", - noExternal: ["typescript-event-target"], - external: ["axios"], + noExternal: ["typescript-event-target", "axios"], esbuildOptions(options) { options.define = { ...options.define, From 63a283bfe7ce8b44e616d5b016a6ca131b24e075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 12 Apr 2025 17:44:19 -0700 Subject: [PATCH 088/160] js-sdk: once again again --- apps/js-sdk/firecrawl/package.json | 6 +++--- apps/js-sdk/firecrawl/tsup.config.ts | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e19699b3..12b65dcf 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.8", + "version": "1.23.9", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -27,7 +27,8 @@ "dependencies": { "typescript-event-target": "^1.1.1", "zod": "^3.23.8", - "zod-to-json-schema": "^3.23.0" + "zod-to-json-schema": "^3.23.0", + "axios": "^1.6.8" }, "bugs": { "url": "https://github.com/mendableai/firecrawl/issues" @@ -41,7 +42,6 @@ "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", - "axios": "^1.6.8", "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts index c2d4d51f..31d67739 100644 --- a/apps/js-sdk/firecrawl/tsup.config.ts +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -8,7 +8,7 @@ export default defineConfig({ clean: true, platform: "node", target: "node22", - noExternal: ["typescript-event-target", "axios"], + noExternal: ["typescript-event-target"], esbuildOptions(options) { options.define = { ...options.define, From d260f367e42195f62ea8d367b674f8717cf85171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 10:12:35 -0700 Subject: [PATCH 089/160] feat(acuc): bump --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 395331af..8cd67c0e 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -138,7 +138,7 @@ export async function getACUC( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_28", + "auth_credit_usage_chunk_29", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); @@ -249,7 +249,7 @@ export async function getACUCTeam( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_28_from_team", + "auth_credit_usage_chunk_29_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 5658232ed6f93aed7fbe10c649cb47a80de4e303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 10:32:03 -0700 Subject: [PATCH 090/160] feat(acuc): bump 30 --- apps/api/src/controllers/auth.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 8cd67c0e..de1ced3a 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -138,7 +138,7 @@ export async function getACUC( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_29", + "auth_credit_usage_chunk_30", { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); @@ -249,7 +249,7 @@ export async function getACUCTeam( const client = Math.random() > (2/3) ? supabase_rr_service : supabase_service; ({ data, error } = await client.rpc( - "auth_credit_usage_chunk_29_from_team", + "auth_credit_usage_chunk_30_from_team", { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, { get: true }, )); From 9ab2a2663613c23d474c3a9a3b881289abdec809 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 13 Apr 2025 18:25:12 +0000 Subject: [PATCH 091/160] Add waitFor of 5000ms for changeTracking format Co-Authored-By: hello@sideguide.dev --- apps/api/src/__tests__/snips/scrape.test.ts | 17 +++++++++++++++++ apps/api/src/scraper/scrapeURL/index.ts | 3 +++ 2 files changed, 20 insertions(+) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index b9f1401e..ebb311f9 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -95,6 +95,23 @@ describe("Scrape tests", () => { expect(response.changeTracking).toBeDefined(); expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); }, 30000); + + it.concurrent("enforces a minimum waitFor of 5000ms", async () => { + const response1 = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + }); + + expect(response1.changeTracking).toBeDefined(); + + const response2 = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + waitFor: 1000, + }); + + expect(response2.changeTracking).toBeDefined(); + }, 30000); it.concurrent("includes git diff when requested", async () => { const response = await scrape({ diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index cedd275e..1408dde1 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -390,6 +390,9 @@ export async function scrapeURL( options: ScrapeOptions, internalOptions: InternalOptions, ): Promise { + if (options.formats.includes("changeTracking") && options.waitFor < 5000) { + options.waitFor = 5000; + } const meta = await buildMetaObject(id, url, options, internalOptions); try { while (true) { From 723e7b7c6055c64b3d83aea69f7251e676948bae Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Apr 2025 11:26:30 -0700 Subject: [PATCH 092/160] Update diff.ts --- apps/api/src/scraper/scrapeURL/transformers/diff.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index f1c2d1fc..b84f66e9 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -86,9 +86,10 @@ export async function deriveDiff(meta: Meta, document: Document): Promise Date: Sun, 13 Apr 2025 11:38:24 -0700 Subject: [PATCH 093/160] feat(api): install git to docker to have proper diffs --- apps/api/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index 5818b886..ec15a2cf 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -38,6 +38,9 @@ COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so +# Install git +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + # Start the server by default, this can be overwritten at runtime EXPOSE 8080 From 8bc2f167126b535a6e5bca8245cb8b6193fae2e5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 13 Apr 2025 18:45:17 +0000 Subject: [PATCH 094/160] Move waitFor check for changeTracking to types.ts refine layer Co-Authored-By: hello@sideguide.dev --- apps/api/src/controllers/v1/types.ts | 4 ++++ apps/api/src/scraper/scrapeURL/index.ts | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 56a93072..718cfba5 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -282,6 +282,10 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 60000 }; } + if (obj.formats?.includes("changeTracking") && obj.waitFor < 5000) { + obj = { ...obj, waitFor: 5000 }; + } + if (obj.formats?.includes("json")) { obj.formats.push("extract"); } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 1408dde1..cedd275e 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -390,9 +390,6 @@ export async function scrapeURL( options: ScrapeOptions, internalOptions: InternalOptions, ): Promise { - if (options.formats.includes("changeTracking") && options.waitFor < 5000) { - options.waitFor = 5000; - } const meta = await buildMetaObject(id, url, options, internalOptions); try { while (true) { From 4026866c15f6d738bfcc605af89e08e684c18c69 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 13 Apr 2025 18:50:22 +0000 Subject: [PATCH 095/160] Fix waitFor check to handle undefined values Co-Authored-By: hello@sideguide.dev --- apps/api/src/controllers/v1/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 718cfba5..780541ba 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -282,7 +282,7 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 60000 }; } - if (obj.formats?.includes("changeTracking") && obj.waitFor < 5000) { + if (obj.formats?.includes("changeTracking") && (obj.waitFor === undefined || obj.waitFor < 5000)) { obj = { ...obj, waitFor: 5000 }; } From 0c5bd8554094e345fd2049fd799259a2148952d5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Apr 2025 11:50:52 -0700 Subject: [PATCH 096/160] Update apps/api/src/__tests__/snips/scrape.test.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Gergő Móricz --- apps/api/src/__tests__/snips/scrape.test.ts | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index ebb311f9..b9f1401e 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -95,23 +95,6 @@ describe("Scrape tests", () => { expect(response.changeTracking).toBeDefined(); expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); }, 30000); - - it.concurrent("enforces a minimum waitFor of 5000ms", async () => { - const response1 = await scrape({ - url: "https://example.com", - formats: ["markdown", "changeTracking"], - }); - - expect(response1.changeTracking).toBeDefined(); - - const response2 = await scrape({ - url: "https://example.com", - formats: ["markdown", "changeTracking"], - waitFor: 1000, - }); - - expect(response2.changeTracking).toBeDefined(); - }, 30000); it.concurrent("includes git diff when requested", async () => { const response = await scrape({ From 6bdae3cf4fffc8ec7e39c41b9d0c73f2e56bf267 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 13 Apr 2025 11:52:01 -0700 Subject: [PATCH 097/160] Add waitFor of 5000ms for changeTracking format (#1450) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add waitFor of 5000ms for changeTracking format Co-Authored-By: hello@sideguide.dev * Move waitFor check for changeTracking to types.ts refine layer Co-Authored-By: hello@sideguide.dev * Fix waitFor check to handle undefined values Co-Authored-By: hello@sideguide.dev * Update apps/api/src/__tests__/snips/scrape.test.ts Co-authored-by: Gergő Móricz --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev Co-authored-by: Nicolas Co-authored-by: Gergő Móricz --- apps/api/src/controllers/v1/types.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 56a93072..780541ba 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -282,6 +282,10 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 60000 }; } + if (obj.formats?.includes("changeTracking") && (obj.waitFor === undefined || obj.waitFor < 5000)) { + obj = { ...obj, waitFor: 5000 }; + } + if (obj.formats?.includes("json")) { obj.formats.push("extract"); } From 94d0b4be438f72f26e8611d45b4f1b956e662a8c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Apr 2025 11:53:35 -0700 Subject: [PATCH 098/160] Update types.ts --- apps/api/src/controllers/v1/types.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 780541ba..c143a52f 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -286,6 +286,10 @@ const extractTransform = (obj) => { obj = { ...obj, waitFor: 5000 }; } + if (obj.formats?.includes("changeTracking") && obj.timeout === 30000) { + obj = { ...obj, timeout: 60000 }; + } + if (obj.formats?.includes("json")) { obj.formats.push("extract"); } From 2857496356758f557678e05ae6eae10d07b0bfb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 22:45:58 -0700 Subject: [PATCH 099/160] feat(diff): better rpc (revert this if broken) --- apps/api/src/scraper/scrapeURL/transformers/diff.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index b84f66e9..196027a5 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -53,7 +53,7 @@ function compareExtractedData(previousData: any, currentData: any): any { export async function deriveDiff(meta: Meta, document: Document): Promise { if (meta.options.formats.includes("changeTracking")) { const res = await supabase_service - .rpc("diff_get_last_scrape_2", { + .rpc("diff_get_last_scrape_3", { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url, }); From 713d58675ebeb7b0033b63cb9f797da9f6db28f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 23:16:55 -0700 Subject: [PATCH 100/160] fix(auth): preview acuc team --- apps/api/src/controllers/auth.ts | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index de1ced3a..297574a9 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -72,6 +72,38 @@ export async function setCachedACUC( } } +const mockPreviewACUC: (team_id: string) => AuthCreditUsageChunkFromTeam = (team_id) => ({ + api_key: "preview", + team_id, + sub_id: "bypass", + sub_current_period_start: new Date().toISOString(), + sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(), + sub_user_id: "bypass", + price_id: "bypass", + rate_limits: { + crawl: 2, + scrape: 10, + extract: 10, + search: 5, + map: 5, + preview: 5, + crawlStatus: 500, + extractStatus: 500, + }, + price_credits: 99999999, + credits_used: 0, + coupon_credits: 99999999, + adjusted_credits_used: 0, + remaining_credits: 99999999, + total_credits_sum: 99999999, + plan_priority: { + bucketLimit: 25, + planModifier: 0.1, + }, + concurrency: 2, + is_extract: false, +}); + const mockACUC: () => AuthCreditUsageChunk = () => ({ api_key: "bypass", team_id: "bypass", @@ -223,6 +255,12 @@ export async function getACUCTeam( let isExtract = mode === RateLimiterMode.Extract || mode === RateLimiterMode.ExtractStatus; + + if (team_id.startsWith("preview")) { + const acuc = mockPreviewACUC(team_id); + acuc.is_extract = isExtract; + return acuc; + } if (process.env.USE_DB_AUTHENTICATION !== "true") { const acuc = mockACUC(); From 07cdde74097a7e21c787f455e7b7cc65dcdb093c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 23:21:34 -0700 Subject: [PATCH 101/160] feat(auth): preview acuc team more --- apps/api/src/controllers/auth.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 297574a9..4ca3b967 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -72,7 +72,7 @@ export async function setCachedACUC( } } -const mockPreviewACUC: (team_id: string) => AuthCreditUsageChunkFromTeam = (team_id) => ({ +const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsageChunk = (team_id, is_extract) => ({ api_key: "preview", team_id, sub_id: "bypass", @@ -100,8 +100,8 @@ const mockPreviewACUC: (team_id: string) => AuthCreditUsageChunkFromTeam = (team bucketLimit: 25, planModifier: 0.1, }, - concurrency: 2, - is_extract: false, + concurrency: is_extract ? 200 : 2, + is_extract, }); const mockACUC: () => AuthCreditUsageChunk = () => ({ @@ -145,6 +145,12 @@ export async function getACUC( let isExtract = mode === RateLimiterMode.Extract || mode === RateLimiterMode.ExtractStatus; + + if (api_key === process.env.PREVIEW_TOKEN) { + const acuc = mockPreviewACUC(api_key, isExtract); + acuc.is_extract = isExtract; + return acuc; + } if (process.env.USE_DB_AUTHENTICATION !== "true") { const acuc = mockACUC(); @@ -257,8 +263,7 @@ export async function getACUCTeam( mode === RateLimiterMode.ExtractStatus; if (team_id.startsWith("preview")) { - const acuc = mockPreviewACUC(team_id); - acuc.is_extract = isExtract; + const acuc = mockPreviewACUC(team_id, isExtract); return acuc; } From ebdf182b00a61f3901c1df504d2e5e562d209780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 13 Apr 2025 23:26:57 -0700 Subject: [PATCH 102/160] feat(auth): more ip --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 4ca3b967..b8063da2 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -393,7 +393,7 @@ export async function supaAuthenticateUser( }; } - const incomingIP = (req.headers["x-forwarded-for"] || + const incomingIP = (req.headers["x-preview-ip"] || req.headers["x-forwarded-for"] || req.socket.remoteAddress) as string; const iptoken = incomingIP + token; From 0b50349fed21c8b4c1b7beca4ee9556d6f0fff53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 14 Apr 2025 11:39:13 -0700 Subject: [PATCH 103/160] feat(v0): fix jobs --- apps/api/src/controllers/v0/crawl-status.ts | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 26ee0ee9..aefcda13 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -10,16 +10,19 @@ import { configDotenv } from "dotenv"; import { Job } from "bullmq"; import { toLegacyDocument } from "../v1/types"; import type { DBJob, PseudoJob } from "../v1/crawl-status"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; configDotenv(); export async function getJobs(crawlId: string, ids: string[]): Promise[]> { - const [bullJobs, dbJobs] = await Promise.all([ + const [bullJobs, dbJobs, gcsJobs] = await Promise.all([ Promise.all(ids.map((x) => getScrapeQueue().getJob(x))).then(x => x.filter(x => x)) as Promise<(Job & { id: string })[]>, process.env.USE_DB_AUTHENTICATION === "true" ? await supabaseGetJobsByCrawlId(crawlId) : [], + process.env.GCS_BUCKET_NAME ? Promise.all(ids.map(async (x) => ({ id: x, job: await getJobFromGCS(x) }))).then(x => x.filter(x => x.job)) as Promise<({ id: string, job: any | null })[]> : [], ]); const bullJobMap = new Map>(); const dbJobMap = new Map(); + const gcsJobMap = new Map(); for (const job of bullJobs) { bullJobMap.set(job.id, job); @@ -28,16 +31,26 @@ export async function getJobs(crawlId: string, ids: string[]): Promise[] = []; for (const id of ids) { const bullJob = bullJobMap.get(id); const dbJob = dbJobMap.get(id); + const gcsJob = gcsJobMap.get(id); if (!bullJob && !dbJob) continue; - const data = dbJob?.docs ?? bullJob?.returnvalue; + const data = gcsJob ?? dbJob?.docs ?? bullJob?.returnvalue; + if (gcsJob === null && data) { + logger.warn("GCS Job not found", { + jobId: id, + }); + } const job: PseudoJob = { id, From 5dca350a11ca5cb3b8c357cf14df9a4ff2edeb91 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Tue, 15 Apr 2025 00:48:19 +0530 Subject: [PATCH 104/160] Add examples/gpt-4.1-crawler --- examples/gpt-4.1-web-crawler/.env.example | 5 + examples/gpt-4.1-web-crawler/.gitignore | 38 +++ examples/gpt-4.1-web-crawler/README.md | 82 ++++++ .../gpt-4.1-web-crawler.py | 261 ++++++++++++++++++ examples/gpt-4.1-web-crawler/requirements.txt | 3 + 5 files changed, 389 insertions(+) create mode 100644 examples/gpt-4.1-web-crawler/.env.example create mode 100644 examples/gpt-4.1-web-crawler/.gitignore create mode 100644 examples/gpt-4.1-web-crawler/README.md create mode 100644 examples/gpt-4.1-web-crawler/gpt-4.1-web-crawler.py create mode 100644 examples/gpt-4.1-web-crawler/requirements.txt diff --git a/examples/gpt-4.1-web-crawler/.env.example b/examples/gpt-4.1-web-crawler/.env.example new file mode 100644 index 00000000..307a65f2 --- /dev/null +++ b/examples/gpt-4.1-web-crawler/.env.example @@ -0,0 +1,5 @@ +# Firecrawl API key +FIRECRAWL_API_KEY=your_firecrawl_api_key_here + +# OpenAI API key +OPENAI_API_KEY=your_openai_api_key_here \ No newline at end of file diff --git a/examples/gpt-4.1-web-crawler/.gitignore b/examples/gpt-4.1-web-crawler/.gitignore new file mode 100644 index 00000000..dbb2de94 --- /dev/null +++ b/examples/gpt-4.1-web-crawler/.gitignore @@ -0,0 +1,38 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environment +venv/ +ENV/ +.env + +# IDE specific files +.idea/ +.vscode/ +*.swp +*.swo + +# Logs +*.log + +# OS specific files +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/examples/gpt-4.1-web-crawler/README.md b/examples/gpt-4.1-web-crawler/README.md new file mode 100644 index 00000000..1cf0e5d7 --- /dev/null +++ b/examples/gpt-4.1-web-crawler/README.md @@ -0,0 +1,82 @@ +# GPT-4.1 Web Crawler + +A smart web crawler powered by GPT-4.1 that intelligently searches websites to find specific information based on user objectives. + +## Features + +- Intelligently maps website content using semantic search +- Ranks website pages by relevance to your objective +- Extracts structured information using GPT-4.1 +- Returns results in clean JSON format + +## Prerequisites + +- Python 3.8+ +- Firecrawl API key +- OpenAI API key (with access to GPT-4.1 models) + +## Installation + +1. Clone this repository: + + ``` + git clone https://github.com/yourusername/gpt-4.1-web-crawler.git + cd gpt-4.1-web-crawler + ``` + +2. Install the required dependencies: + + ``` + pip install -r requirements.txt + ``` + +3. Set up environment variables: + ``` + cp .env.example .env + ``` + Then edit the `.env` file and add your API keys. + +## Usage + +Run the script: + +``` +python gpt-4.1-web-crawler.py +``` + +The program will prompt you for: + +1. The website URL to crawl +2. Your specific objective (what information you want to find) + +Example: + +``` +Enter the website to crawl: https://example.com +Enter your objective: Find the company's leadership team with their roles and short bios +``` + +The crawler will then: + +1. Map the website +2. Identify the most relevant pages +3. Scrape and analyze those pages +4. Return structured information if the objective is met + +## How It Works + +1. **Mapping**: The crawler uses Firecrawl to map the website structure and find relevant pages based on search terms derived from your objective. + +2. **Ranking**: GPT-4.1 analyzes the URLs to determine which pages are most likely to contain the information you're looking for. + +3. **Extraction**: The top pages are scraped and analyzed to extract the specific information requested in your objective. + +4. **Results**: If found, the information is returned in a clean, structured JSON format. + +## License + +[MIT License](LICENSE) + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. diff --git a/examples/gpt-4.1-web-crawler/gpt-4.1-web-crawler.py b/examples/gpt-4.1-web-crawler/gpt-4.1-web-crawler.py new file mode 100644 index 00000000..a82ac050 --- /dev/null +++ b/examples/gpt-4.1-web-crawler/gpt-4.1-web-crawler.py @@ -0,0 +1,261 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from openai import OpenAI + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +openai_api_key = os.getenv("OPENAI_API_KEY") + +# Initialize the FirecrawlApp and OpenAI client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = OpenAI(api_key=openai_api_key) + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + completion = client.chat.completions.create( + model="gpt-4.1-2025-04-14", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": map_prompt + } + ] + } + ] + ) + + map_search_parameter = completion.choices[0].message.content + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + + # Debug print to see the response structure + print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}") + + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + # Handle the response based on its structure + if isinstance(map_website, dict): + # Assuming the links are in a 'urls' or similar key + links = map_website.get('urls', []) or map_website.get('links', []) + elif isinstance(map_website, str): + try: + parsed = json.loads(map_website) + links = parsed.get('urls', []) or parsed.get('links', []) + except json.JSONDecodeError: + links = [] + else: + links = map_website if isinstance(map_website, list) else [] + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + rank_prompt = f""" + Given this list of URLs and the objective: {objective} + Analyze each URL and rank the top 3 most relevant ones that are most likely to contain the information we need. + Return your response as a JSON array with exactly 3 objects, each containing: + - "url": the full URL + - "relevance_score": number between 0-100 indicating relevance to objective + - "reason": brief explanation of why this URL is relevant + + Example output: + [ + {{ + "url": "https://example.com/about", + "relevance_score": 95, + "reason": "Main about page containing company information" + }}, + {{ + "url": "https://example.com/team", + "relevance_score": 80, + "reason": "Team page with leadership details" + }}, + {{ + "url": "https://example.com/contact", + "relevance_score": 70, + "reason": "Contact page with location information" + }} + ] + + URLs to analyze: + {json.dumps(links, indent=2)} + """ + + print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}") + completion = client.chat.completions.create( + model="gpt-4.1-2025-04-14", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": rank_prompt + } + ] + } + ] + ) + + try: + ranked_results = json.loads(completion.choices[0].message.content) + links = [result["url"] for result in ranked_results] + + # Print detailed ranking info + print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}") + for result in ranked_results: + print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}") + print(f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}") + print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}") + print("---") + + if not links: + print(f"{Colors.RED}No relevant links identified.{Colors.RESET}") + return None + + except (json.JSONDecodeError, KeyError) as e: + print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}") + return None + + print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}") + return links + + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app, client): + try: + # Get top 3 links from the map result + if not map_website: + print(f"{Colors.RED}No links found to analyze.{Colors.RESET}") + return None + + top_links = map_website[:3] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + for link in top_links: + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") + # Scrape the page + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + + # Check if objective is met + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. + If the objective is not met with confidence, respond with 'Objective not met'. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + + Remember: + 1. Only return JSON if you are confident the objective is fully met. + 2. Keep the JSON structure as simple and flat as possible. + 3. Do not include any explanations or markdown formatting in your response. + """ + + completion = client.chat.completions.create( + model="gpt-4.1-2025-04-14", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": check_prompt + } + ] + } + ] + ) + + result = completion.choices[0].message.content + + if result != "Objective not met": + print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}") + try: + # Clean up potential markdown formatting or extra text + if "```json" in result: + result = result.split("```json")[1].split("```")[0].strip() + elif "```" in result: + result = result.split("```")[1].split("```")[0].strip() + + # Try to find JSON content if there's explanatory text + if "{" in result and "}" in result: + start_idx = result.find("{") + end_idx = result.rfind("}") + 1 + if start_idx >= 0 and end_idx > start_idx: + result = result[start_idx:end_idx] + + return json.loads(result) + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error in parsing response: {str(e)}. Proceeding to next page...{Colors.RESET}") + # Optionally print the raw response for debugging + # print(f"{Colors.MAGENTA}Raw response: {result}{Colors.RESET}") + else: + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + + print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input(f"{Colors.BLUE}Enter the website to crawl : {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app, client) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using GPT-4.1...{Colors.RESET}") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information :{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/gpt-4.1-web-crawler/requirements.txt b/examples/gpt-4.1-web-crawler/requirements.txt new file mode 100644 index 00000000..0710ac17 --- /dev/null +++ b/examples/gpt-4.1-web-crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl==0.11.0 +openai==1.14.0 +python-dotenv==1.0.0 \ No newline at end of file From b415e625a0c27c83499c6e643f01b7a60d799d77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 00:07:44 +0200 Subject: [PATCH 105/160] feat(scrape): get job result from GCS, avoid Redis (#1461) * feat(scrape): get job result from GCS, avoid Redis * call logjob on scrapes * Fix inverse bool * fix more * migrate gracefully * refactor * feat(tests/search): test with scrape --- apps/api/src/__tests__/snips/search.test.ts | 14 +++++++++++++ apps/api/src/controllers/v0/scrape.ts | 3 ++- apps/api/src/controllers/v0/search.ts | 3 ++- apps/api/src/controllers/v1/scrape.ts | 19 +++--------------- apps/api/src/controllers/v1/search.ts | 4 +++- apps/api/src/lib/extract/document-scraper.ts | 4 +++- apps/api/src/lib/gcs-jobs.ts | 1 + apps/api/src/services/queue-jobs.ts | 19 +++++++++++++++--- apps/api/src/services/queue-worker.ts | 21 +++++++++++++++++--- 9 files changed, 62 insertions(+), 26 deletions(-) diff --git a/apps/api/src/__tests__/snips/search.test.ts b/apps/api/src/__tests__/snips/search.test.ts index 67b07674..22e7bb92 100644 --- a/apps/api/src/__tests__/snips/search.test.ts +++ b/apps/api/src/__tests__/snips/search.test.ts @@ -6,4 +6,18 @@ describe("Search tests", () => { query: "firecrawl" }); }, 60000); + + it.concurrent("works with scrape", async () => { + const res = await search({ + query: "firecrawl", + limit: 5, + scrapeOptions: { + formats: ["markdown"], + }, + }); + + for (const doc of res) { + expect(doc.markdown).toBeDefined(); + } + }, 60000); }); diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 053afd50..b7bb3359 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -30,6 +30,7 @@ import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; import { Document as V0Document } from "./../../lib/entities"; import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; export async function scrapeHelper( jobId: string, @@ -93,7 +94,7 @@ export async function scrapeHelper( }, async (span) => { try { - doc = await waitForJob(jobId, timeout); + doc = await waitForJob(jobId, timeout); } catch (e) { if ( e instanceof Error && diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 45092b77..e216db4b 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -22,6 +22,7 @@ import { fromLegacyScrapeOptions, toLegacyDocument, } from "../v1/types"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; export async function searchHelper( jobId: string, @@ -123,7 +124,7 @@ export async function searchHelper( const docs = ( await Promise.all( - jobDatas.map((x) => waitForJob(x.opts.jobId, 60000)), + jobDatas.map((x) => waitForJob(x.opts.jobId, 60000)), ) ).map((x) => toLegacyDocument(x, internalOptions)); diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 6259981d..15fa097d 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -13,6 +13,8 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { logJob } from "../../services/logging/log_job"; import { getJobPriority } from "../../lib/job-priority"; import { getScrapeQueue } from "../../services/queue-service"; +import { getJob } from "./crawl-status"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; export async function scrapeController( req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, @@ -66,7 +68,7 @@ export async function scrapeController( let doc: Document; try { - doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this + doc = await waitForJob(jobId, timeout + totalWait); } catch (e) { logger.error(`Error in scrapeController: ${e}`, { jobId, @@ -123,21 +125,6 @@ export async function scrapeController( } } - logJob({ - job_id: jobId, - success: true, - message: "Scrape completed", - num_docs: 1, - docs: [doc], - time_taken: timeTakenInSeconds, - team_id: req.auth.team_id, - mode: "scrape", - url: req.body.url, - scrapeOptions: req.body, - origin: origin, - num_tokens: numTokens, - }); - return res.status(200).json({ success: true, data: doc, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index bcb18166..06f26700 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -20,6 +20,7 @@ import * as Sentry from "@sentry/node"; import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; import { logger as _logger } from "../../lib/logger"; import type { Logger } from "winston"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; // Used for deep research export async function searchAndScrapeSearchResult( @@ -99,7 +100,8 @@ async function scrapeSearchResult( jobPriority, ); - const doc = await waitForJob(jobId, options.timeout); + const doc: Document = await waitForJob(jobId, options.timeout); + logger.info("Scrape job completed", { scrapeId: jobId, url: searchResult.url, diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index f5501230..b1d14193 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -5,6 +5,7 @@ import { waitForJob } from "../../services/queue-jobs"; import { addScrapeJob } from "../../services/queue-jobs"; import { getJobPriority } from "../job-priority"; import type { Logger } from "winston"; +import { getJobFromGCS } from "../gcs-jobs"; interface ScrapeDocumentOptions { url: string; @@ -53,7 +54,8 @@ export async function scrapeDocument( jobPriority, ); - const doc = await waitForJob(jobId, timeout); + const doc = await waitForJob(jobId, timeout); + await getScrapeQueue().remove(jobId); if (trace) { diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts index 6895c7e1..f4e68cd2 100644 --- a/apps/api/src/lib/gcs-jobs.ts +++ b/apps/api/src/lib/gcs-jobs.ts @@ -1,6 +1,7 @@ import { FirecrawlJob } from "../types"; import { Storage } from "@google-cloud/storage"; import { logger } from "./logger"; +import { Document } from "../controllers/v1/types"; const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 24924d7d..7778393a 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -13,6 +13,8 @@ import { logger } from "../lib/logger"; import { sendNotificationWithCustomDays } from './notification/email_notification'; import { shouldSendConcurrencyLimitNotification } from './notification/notification-check'; import { getACUC, getACUCTeam } from "../controllers/auth"; +import { getJobFromGCS } from "../lib/gcs-jobs"; +import { Document } from "../controllers/v1/types"; /** * Checks if a job is a crawl or batch scrape based on its options @@ -263,10 +265,10 @@ export async function addScrapeJobs( ); } -export function waitForJob( +export function waitForJob( jobId: string, timeout: number, -): Promise { +): Promise { return new Promise((resolve, reject) => { const start = Date.now(); const int = setInterval(async () => { @@ -277,7 +279,18 @@ export function waitForJob( const state = await getScrapeQueue().getJobState(jobId); if (state === "completed") { clearInterval(int); - resolve((await getScrapeQueue().getJob(jobId))!.returnvalue); + let doc: Document; + doc = (await getScrapeQueue().getJob(jobId))!.returnvalue; + + if (!doc) { + const docs = await getJobFromGCS(jobId); + if (!docs || docs.length === 0) { + throw new Error("Job not found in GCS"); + } + doc = docs[0]; + } + + resolve(doc); } else if (state === "failed") { // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason); const job = await getScrapeQueue().getJob(jobId); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a1bf4df9..57ae8eb8 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -353,11 +353,11 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { if (result.success) { try { if ( - job.data.crawl_id && - process.env.USE_DB_AUTHENTICATION === "true" + process.env.USE_DB_AUTHENTICATION === "true" && + (job.data.crawl_id || process.env.GCS_BUCKET_NAME) ) { logger.debug( - "Job succeeded -- has crawl associated, putting null in Redis", + "Job succeeded -- putting null in Redis", ); await job.moveToCompleted(null, token, false); } else { @@ -1207,6 +1207,21 @@ async function processJob(job: Job & { id: string }, token: string) { await finishCrawlIfNeeded(job, sc); } else { + await logJob({ + job_id: job.id, + success: true, + message: "Scrape completed", + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: "scrape", + url: job.data.url, + scrapeOptions: job.data.scrapeOptions, + origin: job.data.origin, + num_tokens: 0, // TODO: fix + }); + indexJob(job, doc); } From 0446443bbec9752a2bd455b04b665db9c9d1d059 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 14 Apr 2025 16:06:38 -0700 Subject: [PATCH 106/160] Nick: acuc cache on now --- apps/api/src/controllers/auth.ts | 60 ++++++++++++++++---------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index b8063da2..8e2e260f 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -64,8 +64,8 @@ export async function setCachedACUC( throw signal.error; } - // Cache for 1 hour. - mogery - await setValue(cacheKeyACUC, JSON.stringify(acuc), 3600, true); + // Cache for 10 minutes. - mogery + await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true); }); } catch (error) { logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); @@ -160,14 +160,14 @@ export async function getACUC( const cacheKeyACUC = `acuc_${api_key}_${isExtract ? "extract" : "scrape"}`; - // if (useCache) { - // const cachedACUC = await getValue(cacheKeyACUC); - // if (cachedACUC !== null) { - // return JSON.parse(cachedACUC); - // } - // } + if (useCache) { + const cachedACUC = await getValue(cacheKeyACUC); + if (cachedACUC !== null) { + return JSON.parse(cachedACUC); + } + } - // if (!cacheOnly) { + if (!cacheOnly) { let data; let error; let retries = 0; @@ -205,14 +205,14 @@ export async function getACUC( data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery - // if (chunk !== null && useCache) { - // setCachedACUC(api_key, isExtract, chunk); - // } + if (chunk !== null && useCache) { + setCachedACUC(api_key, isExtract, chunk); + } return chunk ? { ...chunk, is_extract: isExtract } : null; - // } else { - // return null; - // } + } else { + return null; + } } export async function setCachedACUCTeam( @@ -244,8 +244,8 @@ export async function setCachedACUCTeam( throw signal.error; } - // Cache for 1 hour. - mogery - await setValue(cacheKeyACUC, JSON.stringify(acuc), 3600, true); + // Cache for 10 minutes. - mogery + await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true); }); } catch (error) { logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); @@ -275,14 +275,14 @@ export async function getACUCTeam( const cacheKeyACUC = `acuc_team_${team_id}_${isExtract ? "extract" : "scrape"}`; - // if (useCache) { - // const cachedACUC = await getValue(cacheKeyACUC); - // if (cachedACUC !== null) { - // return JSON.parse(cachedACUC); - // } - // } + if (useCache) { + const cachedACUC = await getValue(cacheKeyACUC); + if (cachedACUC !== null) { + return JSON.parse(cachedACUC); + } + } - // if (!cacheOnly) { + if (!cacheOnly) { let data; let error; let retries = 0; @@ -321,14 +321,14 @@ export async function getACUCTeam( data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery - // if (chunk !== null && useCache) { - // setCachedACUC(api_key, chunk); - // } + if (chunk !== null && useCache) { + setCachedACUCTeam(team_id, isExtract, chunk); + } return chunk ? { ...chunk, is_extract: isExtract } : null; - // } else { - // return null; - // } + } else { + return null; + } } export async function clearACUC(api_key: string): Promise { From e2c4b0e72f84df3a67bae61ae71982e769d2d9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 14 Apr 2025 18:12:36 -0700 Subject: [PATCH 107/160] remove double v0 log --- apps/api/src/controllers/v0/scrape.ts | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index b7bb3359..86efd2f5 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -299,22 +299,6 @@ export async function scrapeController(req: Request, res: Response) { team_id, ); - logJob({ - job_id: jobId, - success: result.success, - message: result.error, - num_docs: 1, - docs: [doc], - time_taken: timeTakenInSeconds, - team_id: team_id, - mode: "scrape", - url: req.body.url, - crawlerOptions: crawlerOptions, - scrapeOptions, - origin: origin, - num_tokens: numTokens, - }); - return res.status(result.returnCode).json(result); } catch (error) { Sentry.captureException(error); From 6634d236bf1fccb6b1cbfc032c8af820579e9d27 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Apr 2025 00:19:45 -0700 Subject: [PATCH 108/160] (feat/fire-1) FIRE-1 (#1462) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * wip * integrating smart-scrape * integrate smartscrape into llmExtract * wip * smart scrape multiple links * fixes * fix * wip * it worked! * wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON * wip * retry model * retry models * feat/scrape+json+extract interfaces ready * vertex -> googleapi * fix/transformArrayToObject. required params on schema is still a bug * change model * o3-mini -> gemini * Update extractSmartScrape.ts * sessionId * sessionId * Nick: f-0 start * Update extraction-service-f0.ts * Update types.ts * Nick: * Update queue-worker.ts * Nick: new interface * rename analyzeSchemaAndPrompt -> F0 * refactor: rename agent ID to model in types and extract logic * agent * id->model * id->model * refactor: standardize agent model handling and validation across extraction logic * livecast agent * (feat/f1) sdks (#1459) * feat: add FIRE-1 agent support to Python and JavaScript SDKs Co-Authored-By: hello@sideguide.dev * feat: add FIRE-1 agent support to scrape methods in both SDKs Co-Authored-By: hello@sideguide.dev * feat: add prompt and sessionId to AgentOptions interface Co-Authored-By: hello@sideguide.dev * Update index.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev Co-authored-by: Nicolas * feat(v1): rate limits * Update types.ts * Update llmExtract.ts * add cost tracking * remove * Update requests.http * fix smart scrape cost calc * log sm cost * fix counts * fix * expose cost tracking * models fix * temp: skipLibcheck * get rid of it * fix ts * dont skip lib check * Update extractSmartScrape.ts * Update queue-worker.ts * Update smartScrape.ts * Update requests.http * fix(rate-limiter): * types: fire-1 refine * bill 150 * fix credits used on crawl * ban from crawl * route cost limit warning * Update generic-ai.ts * genres * Update llmExtract.ts * test server diff * cletu --------- Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Co-authored-by: Thomas Kosmas Co-authored-by: Ademílson F. Tonato Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev Co-authored-by: Gergő Móricz --- .github/workflows/test-server.yml | 4 + .gitignore | 3 +- apps/api/package.json | 13 +- apps/api/pnpm-lock.yaml | 298 +- apps/api/requests.http | 38 +- apps/api/src/controllers/auth.ts | 6 +- apps/api/src/controllers/v0/crawl-status.ts | 2 +- apps/api/src/controllers/v0/scrape.ts | 7 - apps/api/src/controllers/v1/crawl-status.ts | 6 +- apps/api/src/controllers/v1/extract-status.ts | 1 + apps/api/src/controllers/v1/extract.ts | 2 + apps/api/src/controllers/v1/scrape-status.ts | 4 +- apps/api/src/controllers/v1/scrape.ts | 9 + apps/api/src/controllers/v1/types.ts | 143 +- apps/api/src/index.ts | 8 +- .../src/lib/deep-research/research-manager.ts | 31 +- apps/api/src/lib/extract/build-prompts.ts | 22 +- .../completions/analyzeSchemaAndPrompt.ts | 15 +- .../lib/extract/completions/batchExtract.ts | 85 +- .../extract/completions/checkShouldExtract.ts | 3 +- .../lib/extract/completions/singleAnswer.ts | 82 +- apps/api/src/lib/extract/config.ts | 4 +- apps/api/src/lib/extract/extract-redis.ts | 3 + .../api/src/lib/extract/extraction-service.ts | 418 +- .../lib/extract/fire-0/build-document-f0.ts | 17 + .../lib/extract/fire-0/build-prompts-f0.ts | 115 + .../completions/analyzeSchemaAndPrompt-f0.ts | 87 + .../fire-0/completions/batchExtract-f0.ts | 54 + .../completions/checkShouldExtract-f0.ts | 39 + .../fire-0/completions/singleAnswer-f0.ts | 42 + .../lib/extract/fire-0/document-scraper-f0.ts | 98 + .../extract/fire-0/extraction-service-f0.ts | 807 ++++ .../helpers/deduplicate-objs-array-f0.ts | 29 + .../fire-0/helpers/dereference-schema-f0.ts | 10 + .../fire-0/helpers/merge-null-val-objs-f0.ts | 153 + .../fire-0/helpers/mix-schema-objs-f0.ts | 48 + .../fire-0/helpers/source-tracker-f0.ts | 151 + .../fire-0/helpers/spread-schemas-f0.ts | 82 + .../helpers/transform-array-to-obj-f0.ts | 167 + .../src/lib/extract/fire-0/llmExtract-f0.ts | 469 +++ apps/api/src/lib/extract/fire-0/ranker-f0.ts | 86 + .../api/src/lib/extract/fire-0/reranker-f0.ts | 293 ++ .../lib/extract/fire-0/url-processor-f0.ts | 250 ++ .../lib/extract/fire-0/usage/llm-cost-f0.ts | 61 + .../extract/helpers/merge-null-val-objs.ts | 5 + .../extract/helpers/transform-array-to-obj.ts | 25 +- apps/api/src/lib/extract/reranker.ts | 176 +- apps/api/src/lib/extract/url-processor.ts | 104 +- apps/api/src/lib/generic-ai.ts | 65 +- apps/api/src/routes/v1.ts | 9 + .../scrapeURL/lib/extractSmartScrape.ts | 347 ++ .../src/scraper/scrapeURL/lib/smartScrape.ts | 164 + .../scraper/scrapeURL/transformers/agent.ts | 65 + .../scraper/scrapeURL/transformers/diff.ts | 21 +- .../scraper/scrapeURL/transformers/index.ts | 3 + .../scrapeURL/transformers/llmExtract.ts | 538 ++- apps/api/src/services/agentLivecastWS.ts | 56 + apps/api/src/services/logging/log_job.ts | 2 + apps/api/src/services/queue-worker.ts | 161 +- apps/api/src/services/rate-limiter.ts | 37 +- apps/api/src/types.ts | 4 + apps/api/tsconfig.json | 21 +- apps/js-sdk/firecrawl/pnpm-lock.yaml | 3611 +++++++++++++++++ apps/js-sdk/firecrawl/src/index.ts | 17 + apps/python-sdk/firecrawl/firecrawl.py | 10 +- 65 files changed, 9155 insertions(+), 551 deletions(-) create mode 100644 apps/api/src/lib/extract/fire-0/build-document-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/build-prompts-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/completions/analyzeSchemaAndPrompt-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/completions/batchExtract-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/completions/checkShouldExtract-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/completions/singleAnswer-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/document-scraper-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/extraction-service-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/deduplicate-objs-array-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/dereference-schema-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/merge-null-val-objs-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/mix-schema-objs-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/source-tracker-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/spread-schemas-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/helpers/transform-array-to-obj-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/llmExtract-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/ranker-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/reranker-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/url-processor-f0.ts create mode 100644 apps/api/src/lib/extract/fire-0/usage/llm-cost-f0.ts create mode 100644 apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts create mode 100644 apps/api/src/scraper/scrapeURL/lib/smartScrape.ts create mode 100644 apps/api/src/scraper/scrapeURL/transformers/agent.ts create mode 100644 apps/api/src/services/agentLivecastWS.ts create mode 100644 apps/js-sdk/firecrawl/pnpm-lock.yaml diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index 7a45dece..bf37baa3 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -31,6 +31,10 @@ env: RUNPOD_MU_API_KEY: ${{ secrets.RUNPOD_MU_API_KEY }} GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }} GCS_BUCKET_NAME: ${{ secrets.GCS_BUCKET_NAME }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }} USE_GO_MARKDOWN_PARSER: true jobs: diff --git a/.gitignore b/.gitignore index a19465c1..37baa733 100644 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,5 @@ apps/js-sdk/firecrawl/dist .vscode llm-links.txt -mapped-links.txt \ No newline at end of file +mapped-links.txt +gke-key.json \ No newline at end of file diff --git a/apps/api/package.json b/apps/api/package.json index eae70214..c3363c45 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -35,7 +35,8 @@ "@types/body-parser": "^1.19.2", "@types/cors": "^2.8.13", "@types/escape-html": "^1.0.4", - "@types/express": "^4.17.17", + "@types/express": "^4.17.21", + "@types/express-ws": "^3.0.5", "@types/jest": "^29.5.12", "@types/lodash": "^4.17.14", "@types/node": "^20.14.1", @@ -51,7 +52,13 @@ "typescript": "^5.8.3" }, "dependencies": { - "@ai-sdk/openai": "^1.3.10", + "@ai-sdk/anthropic": "^1.2.4", + "@ai-sdk/deepinfra": "^0.2.4", + "@ai-sdk/fireworks": "^0.2.4", + "@ai-sdk/google": "^1.2.3", + "@ai-sdk/google-vertex": "^2.2.15", + "@ai-sdk/groq": "^1.2.1", + "@ai-sdk/openai": "^1.3.12", "@anthropic-ai/sdk": "^0.24.3", "@apidevtools/json-schema-ref-parser": "^11.7.3", "@brillout/import": "^0.2.2", @@ -61,12 +68,12 @@ "@dqbd/tiktoken": "^1.0.17", "@google-cloud/storage": "^7.16.0", "@nangohq/node": "^0.40.8", + "@openrouter/ai-sdk-provider": "^0.4.5", "@pinecone-database/pinecone": "^4.0.0", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", "@supabase/supabase-js": "^2.44.2", - "@types/express-ws": "^3.0.4", "@types/ws": "^8.5.12", "ai": "^4.3.4", "ajv": "^8.16.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 028423d3..6d3aa164 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -8,9 +8,27 @@ importers: .: dependencies: + '@ai-sdk/anthropic': + specifier: ^1.2.4 + version: 1.2.4(zod@3.24.2) + '@ai-sdk/deepinfra': + specifier: ^0.2.4 + version: 0.2.4(zod@3.24.2) + '@ai-sdk/fireworks': + specifier: ^0.2.4 + version: 0.2.4(zod@3.24.2) + '@ai-sdk/google': + specifier: ^1.2.3 + version: 1.2.3(zod@3.24.2) + '@ai-sdk/google-vertex': + specifier: ^2.2.15 + version: 2.2.15(encoding@0.1.13)(zod@3.24.2) + '@ai-sdk/groq': + specifier: ^1.2.1 + version: 1.2.1(zod@3.24.2) '@ai-sdk/openai': - specifier: ^1.3.10 - version: 1.3.10(zod@3.24.2) + specifier: ^1.3.12 + version: 1.3.12(zod@3.24.2) '@anthropic-ai/sdk': specifier: ^0.24.3 version: 0.24.3(encoding@0.1.13) @@ -38,6 +56,9 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@openrouter/ai-sdk-provider': + specifier: ^0.4.5 + version: 0.4.5(zod@3.24.2) '@pinecone-database/pinecone': specifier: ^4.0.0 version: 4.0.0 @@ -53,15 +74,12 @@ importers: '@supabase/supabase-js': specifier: ^2.44.2 version: 2.44.2 - '@types/express-ws': - specifier: ^3.0.4 - version: 3.0.4 '@types/ws': specifier: ^8.5.12 version: 8.5.12 ai: specifier: ^4.3.4 - version: 4.3.4(react@18.3.1)(zod@3.24.2) + version: 4.3.5(react@18.3.1)(zod@3.24.2) ajv: specifier: ^8.16.0 version: 8.16.0 @@ -286,8 +304,11 @@ importers: specifier: ^1.0.4 version: 1.0.4 '@types/express': - specifier: ^4.17.17 + specifier: ^4.17.21 version: 4.17.21 + '@types/express-ws': + specifier: ^3.0.5 + version: 3.0.5 '@types/jest': specifier: ^29.5.12 version: 29.5.12 @@ -330,12 +351,75 @@ importers: packages: - '@ai-sdk/openai@1.3.10': - resolution: {integrity: sha512-XO0wF2lmAMWCYjkM5bLpWTKoXet61fBiIimTi+blqEGiLUjAvivt/1zZL1Lzhrv9+p19IC1rn9EWZI1dCelV8w==} + '@ai-sdk/anthropic@1.2.10': + resolution: {integrity: sha512-PyE7EC2fPjs9DnzRAHDrPQmcnI2m2Eojr8pfhckOejOlDEh2w7NnSJr1W3qe5hUWzKr+6d7NG1ZKR9fhmpDdEQ==} engines: {node: '>=18'} peerDependencies: zod: ^3.0.0 + '@ai-sdk/anthropic@1.2.4': + resolution: {integrity: sha512-dAN6MXvLffeFVAr2gz3RGvOTgX1KL/Yn5q1l4/Dt0TUeDjQgCt4AbbYxZZB2qIAYzQvoyAFPhlw0sB3nNizG/g==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/deepinfra@0.2.4': + resolution: {integrity: sha512-JBF3tUOLYgQDCwkvN9I5ZbSqsAxTJWOKmIpyJXJl5RpLXOEviJUqpKSZufs11J9S4Z0U9vZX9jfhO1+DBjS56w==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/fireworks@0.2.4': + resolution: {integrity: sha512-tNXJfEyyXHBD4hMoYjZW/IrsZNcTlmZkQFx3hFRwhiz35rT9TC9QG/RuKCz+UtziQU765g7NP4G/t7f0cJ154Q==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/google-vertex@2.2.15': + resolution: {integrity: sha512-XTl0dQ1rvLjhrkifSy/483qw3O7vCI6H2b4aAJnzQMfy0vzczMXmvQFS5RA8KmnO+YvsKTuZwBM2xRCNvKw1oQ==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/google@1.2.11': + resolution: {integrity: sha512-gjGcxKcRri/Jbkujs9nVwP4qOW5GI4rYQ6vQ17uLAvGMo3qnwr26Q2KUqUWuVHQYtboXVSrxC/Kb6sm3hE5WUQ==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/google@1.2.3': + resolution: {integrity: sha512-zsgwko7T+MFIdEfhg4fIXv6O2dnzTLFr6BOpAA21eo/moOBA5szVzOto1jTwIwoBYsF2ixPGNZBoc+k/fQ2AWw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/groq@1.2.1': + resolution: {integrity: sha512-e9Vn6sE6u+pm97YSK9+xiTgQ2ScRdipE5gAwXj/9HdgMnUyp3mDpWjFsmDM6bzyeb2iKOGv6f3eiRsLxOAPv4A==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/openai-compatible@0.2.4': + resolution: {integrity: sha512-hLQnBn5e69rUXvXW+9SOkiL+S4yQX62hjtlX3zKXBI/3VnfOTcGKMamK51GoQB7uQCN1h7l9orvWqWpuQXxzRg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/openai@1.3.12': + resolution: {integrity: sha512-ueAP69p8a/ZR2ns+pmlr9h/nyV2/DAwzfnPUGZiLpXbxWnLXd2g3a7l38CuEhBydH/nOfDb/byMgpS8+bnJHTg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + + '@ai-sdk/provider-utils@2.1.10': + resolution: {integrity: sha512-4GZ8GHjOFxePFzkl3q42AU0DQOtTQ5w09vmaWUf/pKFXJPizlnzKSUkF0f+VkapIUfDugyMqPMT1ge8XQzVI7Q==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + peerDependenciesMeta: + zod: + optional: true + '@ai-sdk/provider-utils@2.1.9': resolution: {integrity: sha512-NerKjTuuUUs6glJGaentaXEBH52jRM0pR+cRCzc7aWke/K5jYBD6Frv1JYBpcxS7gnnCqSQZR9woiyS+6jrdjw==} engines: {node: '>=18'} @@ -345,20 +429,50 @@ packages: zod: optional: true + '@ai-sdk/provider-utils@2.2.1': + resolution: {integrity: sha512-BuExLp+NcpwsAVj1F4bgJuQkSqO/+roV9wM7RdIO+NVrcT8RBUTdXzf5arHt5T58VpK7bZyB2V9qigjaPHE+Dg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.23.8 + + '@ai-sdk/provider-utils@2.2.3': + resolution: {integrity: sha512-o3fWTzkxzI5Af7U7y794MZkYNEsxbjLam2nxyoUZSScqkacb7vZ3EYHLh21+xCcSSzEC161C7pZAGHtC0hTUMw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.23.8 + '@ai-sdk/provider-utils@2.2.6': resolution: {integrity: sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==} engines: {node: '>=18'} peerDependencies: zod: ^3.23.8 + '@ai-sdk/provider-utils@2.2.7': + resolution: {integrity: sha512-kM0xS3GWg3aMChh9zfeM+80vEZfXzR3JEUBdycZLtbRZ2TRT8xOj3WodGHPb06sUK5yD7pAXC/P7ctsi2fvUGQ==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.23.8 + '@ai-sdk/provider@1.0.8': resolution: {integrity: sha512-f9jSYwKMdXvm44Dmab1vUBnfCDSFfI5rOtvV1W9oKB7WYHR5dGvCC6x68Mk3NUfrdmNoMVHGoh6JT9HCVMlMow==} engines: {node: '>=18'} + '@ai-sdk/provider@1.0.9': + resolution: {integrity: sha512-jie6ZJT2ZR0uVOVCDc9R2xCX5I/Dum/wEK28lx21PJx6ZnFAN9EzD2WsPhcDWfCgGx3OAZZ0GyM3CEobXpa9LA==} + engines: {node: '>=18'} + + '@ai-sdk/provider@1.1.0': + resolution: {integrity: sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==} + engines: {node: '>=18'} + '@ai-sdk/provider@1.1.2': resolution: {integrity: sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==} engines: {node: '>=18'} + '@ai-sdk/provider@1.1.3': + resolution: {integrity: sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==} + engines: {node: '>=18'} + '@ai-sdk/react@1.2.8': resolution: {integrity: sha512-S2FzCSi4uTF0JuSN6zYMXyiAWVAzi/Hho8ISYgHpGZiICYLNCP2si4DuXQOsnWef3IXzQPLVoE11C63lILZIkw==} engines: {node: '>=18'} @@ -949,6 +1063,12 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} + '@openrouter/ai-sdk-provider@0.4.5': + resolution: {integrity: sha512-gbCOcSjNhyWlLHyYZX2rIFnpJi3C2RXNyyzJj+d6pMRfTS/mdvEEOsU66KxK9H8Qju2i9YRLOn/FdQT26K7bIQ==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1644,8 +1764,8 @@ packages: '@types/express-serve-static-core@4.19.3': resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==} - '@types/express-ws@3.0.4': - resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==} + '@types/express-ws@3.0.5': + resolution: {integrity: sha512-lbWMjoHrm/v85j81UCmb/GNZFO3genxRYBW1Ob7rjRI+zxUBR+4tcFuOpKKsYQ1LYTYiy3356epLeYi/5zxUwA==} '@types/express@4.17.21': resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==} @@ -1820,8 +1940,8 @@ packages: resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==} engines: {node: '>= 8.0.0'} - ai@4.3.4: - resolution: {integrity: sha512-uMjzrowIqfU8CCCxhx8QGl7ETydHBROeNL0VoEwetkmDCY6Q8ZTacj6jNNqGJOiCk595aUrGR9VHPY9Ylvy1fg==} + ai@4.3.5: + resolution: {integrity: sha512-hxJ+6YCdGOK1MVPGITmz1if+LXR/aW72w8TI8kiV+3R7lpK1hfpApR8EjqN2ag6cWa0R7OEI3gb/srWkQ3hT2Q==} engines: {node: '>=18'} peerDependencies: react: ^18 || ^19 || ^19.0.0-rc @@ -2091,8 +2211,8 @@ packages: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} - chalk@5.3.0: - resolution: {integrity: sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==} + chalk@5.4.1: + resolution: {integrity: sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==} engines: {node: ^12.17.0 || ^14.13 || >=16.0.0} char-regex@1.0.2: @@ -4350,8 +4470,8 @@ packages: resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==} engines: {node: '>= 0.4'} - swr@2.3.2: - resolution: {integrity: sha512-RosxFpiabojs75IwQ316DGoDRmOqtiAj0tg8wCcbEu4CiLZBs/a9QNtHV7TUfDXmmlgqij/NqzKq/eLelyv9xA==} + swr@2.3.3: + resolution: {integrity: sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==} peerDependencies: react: ^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -4565,8 +4685,8 @@ packages: urlpattern-polyfill@10.0.0: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} - use-sync-external-store@1.4.0: - resolution: {integrity: sha512-9WXSPC5fMv61vaupRkCKCxsPxBocVnwakBEkMIHHpkTTg6icbJtg6jzgtLDm4bl3cSHAca52rYWih0k4K3PfHw==} + use-sync-external-store@1.5.0: + resolution: {integrity: sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A==} peerDependencies: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -4773,8 +4893,8 @@ packages: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} - zod-to-json-schema@3.24.2: - resolution: {integrity: sha512-pNUqrcSxuuB3/+jBbU8qKUbTbDqYUaG1vf5cXFjbhGgoUuA1amO/y4Q8lzfOhHU8HNPK6VFJ18lBDKj3OHyDsg==} + zod-to-json-schema@3.24.5: + resolution: {integrity: sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==} peerDependencies: zod: ^3.24.1 @@ -4786,10 +4906,81 @@ packages: snapshots: - '@ai-sdk/openai@1.3.10(zod@3.24.2)': + '@ai-sdk/anthropic@1.2.10(zod@3.24.2)': dependencies: - '@ai-sdk/provider': 1.1.2 - '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) + '@ai-sdk/provider': 1.1.3 + '@ai-sdk/provider-utils': 2.2.7(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/anthropic@1.2.4(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.3(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/deepinfra@0.2.4(zod@3.24.2)': + dependencies: + '@ai-sdk/openai-compatible': 0.2.4(zod@3.24.2) + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.3(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/fireworks@0.2.4(zod@3.24.2)': + dependencies: + '@ai-sdk/openai-compatible': 0.2.4(zod@3.24.2) + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.3(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/google-vertex@2.2.15(encoding@0.1.13)(zod@3.24.2)': + dependencies: + '@ai-sdk/anthropic': 1.2.10(zod@3.24.2) + '@ai-sdk/google': 1.2.11(zod@3.24.2) + '@ai-sdk/provider': 1.1.3 + '@ai-sdk/provider-utils': 2.2.7(zod@3.24.2) + google-auth-library: 9.15.1(encoding@0.1.13) + zod: 3.24.2 + transitivePeerDependencies: + - encoding + - supports-color + + '@ai-sdk/google@1.2.11(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.3 + '@ai-sdk/provider-utils': 2.2.7(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/google@1.2.3(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/groq@1.2.1(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/openai-compatible@0.2.4(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + '@ai-sdk/provider-utils': 2.2.3(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/openai@1.3.12(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.3 + '@ai-sdk/provider-utils': 2.2.7(zod@3.24.2) + zod: 3.24.2 + + '@ai-sdk/provider-utils@2.1.10(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.0.9 + eventsource-parser: 3.0.0 + nanoid: 3.3.8 + secure-json-parse: 2.7.0 + optionalDependencies: zod: 3.24.2 '@ai-sdk/provider-utils@2.1.9(zod@3.24.2)': @@ -4801,6 +4992,20 @@ snapshots: optionalDependencies: zod: 3.24.2 + '@ai-sdk/provider-utils@2.2.1(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + nanoid: 3.3.8 + secure-json-parse: 2.7.0 + zod: 3.24.2 + + '@ai-sdk/provider-utils@2.2.3(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.0 + nanoid: 3.3.8 + secure-json-parse: 2.7.0 + zod: 3.24.2 + '@ai-sdk/provider-utils@2.2.6(zod@3.24.2)': dependencies: '@ai-sdk/provider': 1.1.2 @@ -4808,20 +5013,39 @@ snapshots: secure-json-parse: 2.7.0 zod: 3.24.2 + '@ai-sdk/provider-utils@2.2.7(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.1.3 + nanoid: 3.3.8 + secure-json-parse: 2.7.0 + zod: 3.24.2 + '@ai-sdk/provider@1.0.8': dependencies: json-schema: 0.4.0 + '@ai-sdk/provider@1.0.9': + dependencies: + json-schema: 0.4.0 + + '@ai-sdk/provider@1.1.0': + dependencies: + json-schema: 0.4.0 + '@ai-sdk/provider@1.1.2': dependencies: json-schema: 0.4.0 + '@ai-sdk/provider@1.1.3': + dependencies: + json-schema: 0.4.0 + '@ai-sdk/react@1.2.8(react@18.3.1)(zod@3.24.2)': dependencies: '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) '@ai-sdk/ui-utils': 1.2.7(zod@3.24.2) react: 18.3.1 - swr: 2.3.2(react@18.3.1) + swr: 2.3.3(react@18.3.1) throttleit: 2.1.0 optionalDependencies: zod: 3.24.2 @@ -4831,7 +5055,7 @@ snapshots: '@ai-sdk/provider': 1.1.2 '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) zod: 3.24.2 - zod-to-json-schema: 3.24.2(zod@3.24.2) + zod-to-json-schema: 3.24.5(zod@3.24.2) '@ampproject/remapping@2.3.0': dependencies: @@ -5906,6 +6130,12 @@ snapshots: '@one-ini/wasm@0.1.1': {} + '@openrouter/ai-sdk-provider@0.4.5(zod@3.24.2)': + dependencies: + '@ai-sdk/provider': 1.0.9 + '@ai-sdk/provider-utils': 2.1.10(zod@3.24.2) + zod: 3.24.2 + '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6837,7 +7067,7 @@ snapshots: '@types/range-parser': 1.2.7 '@types/send': 0.17.4 - '@types/express-ws@3.0.4': + '@types/express-ws@3.0.5': dependencies: '@types/express': 4.17.21 '@types/express-serve-static-core': 4.19.3 @@ -7025,7 +7255,7 @@ snapshots: dependencies: humanize-ms: 1.2.1 - ai@4.3.4(react@18.3.1)(zod@3.24.2): + ai@4.3.5(react@18.3.1)(zod@3.24.2): dependencies: '@ai-sdk/provider': 1.1.2 '@ai-sdk/provider-utils': 2.2.6(zod@3.24.2) @@ -7350,7 +7580,7 @@ snapshots: ansi-styles: 4.3.0 supports-color: 7.2.0 - chalk@5.3.0: {} + chalk@5.4.1: {} char-regex@1.0.2: {} @@ -8829,7 +9059,7 @@ snapshots: jsondiffpatch@0.6.0: dependencies: '@types/diff-match-patch': 1.0.36 - chalk: 5.3.0 + chalk: 5.4.1 diff-match-patch: 1.0.5 jsonfile@6.1.0: @@ -9956,11 +10186,11 @@ snapshots: supports-preserve-symlinks-flag@1.0.0: {} - swr@2.3.2(react@18.3.1): + swr@2.3.3(react@18.3.1): dependencies: dequal: 2.0.3 react: 18.3.1 - use-sync-external-store: 1.4.0(react@18.3.1) + use-sync-external-store: 1.5.0(react@18.3.1) sylvester@0.0.12: {} @@ -10158,7 +10388,7 @@ snapshots: urlpattern-polyfill@10.0.0: {} - use-sync-external-store@1.4.0(react@18.3.1): + use-sync-external-store@1.5.0(react@18.3.1): dependencies: react: 18.3.1 @@ -10336,7 +10566,7 @@ snapshots: yocto-queue@0.1.0: {} - zod-to-json-schema@3.24.2(zod@3.24.2): + zod-to-json-schema@3.24.5(zod@3.24.2): dependencies: zod: 3.24.2 diff --git a/apps/api/requests.http b/apps/api/requests.http index 2ef8bb37..9183ad24 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,6 +1,6 @@ # Pick your baseUrl here: -@baseUrl = http://localhost:3002 -#@baseUrl = https://api.firecrawl.dev +# @baseUrl = http://localhost:3002 +@baseUrl = https://api.firecrawl.dev ### Scrape Website # @name scrape @@ -9,7 +9,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url":"https://firecrawl.dev" + "url": "https://firecrawl.dev" } ### Crawl Website @@ -65,38 +65,6 @@ content-type: application/json "sitemapOnly": true } -### Extract Firecrawl Title -# @name extractFirecrawl -POST {{baseUrl}}/v1/extract HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "urls": [ - "https://firecrawl.dev/blog" - ], - "origin": "api-sdk", - "prompt": "Extract all the blog titles from the page, is multity entity = true", - "schema": { - "type": "object", - "properties": { - "blog_titles": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["blog_titles"] - } -} - -### -@extractFirecrawlId = {{extractFirecrawl.response.body.$.id}} -# @name extractFirecrawlStatus -GET {{baseUrl}}/v1/extract/{{extractFirecrawlId}} HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - ### DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 8e2e260f..d0392428 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,5 +1,5 @@ import { parseApi } from "../lib/parseApi"; -import { getRateLimiter, isTestSuiteToken } from "../services/rate-limiter"; +import { getRateLimiter } from "../services/rate-limiter"; import { AuthResponse, NotificationType, @@ -89,6 +89,8 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage preview: 5, crawlStatus: 500, extractStatus: 500, + extractAgentPreview: 1, + scrapeAgentPreview: 5, }, price_credits: 99999999, credits_used: 0, @@ -121,6 +123,8 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({ preview: 99999999, crawlStatus: 99999999, extractStatus: 99999999, + extractAgentPreview: 99999999, + scrapeAgentPreview: 99999999, }, price_credits: 99999999, credits_used: 0, diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index aefcda13..f9ed7917 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -64,7 +64,7 @@ export async function getJobs(crawlId: string, ids: string[]): Promise { @@ -125,6 +128,12 @@ export async function scrapeController( } } + const cost_tracking = doc?.metadata?.costTracking; + + if (doc && doc.metadata) { + delete doc.metadata.costTracking; + } + return res.status(200).json({ success: true, data: doc, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index c143a52f..9e212d4d 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -55,6 +55,17 @@ export const url = z.preprocess( const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; +export const agentExtractModelValue = 'fire-1' +export const isAgentExtractModelValid = (x: string | undefined) => x?.toLowerCase() === agentExtractModelValue; + +export const agentOptionsExtract = z + .object({ + model: z.string().default(agentExtractModelValue), + }) + .strict(strictMessage); + +export type AgentOptions = z.infer; + export const extractOptions = z .object({ mode: z.enum(["llm"]).default("llm"), @@ -62,13 +73,53 @@ export const extractOptions = z systemPrompt: z .string() .max(10000) - .default( - "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.", - ), + .default(""), prompt: z.string().max(10000).optional(), temperature: z.number().optional(), }) - .strict(strictMessage); + .strict(strictMessage) + .transform((data) => ({ + ...data, + systemPrompt: "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required." + })); + +export const extractOptionsWithAgent = z + .object({ + mode: z.enum(["llm"]).default("llm"), + schema: z.any().optional(), + systemPrompt: z + .string() + .max(10000) + .default(""), + prompt: z.string().max(10000).optional(), + temperature: z.number().optional(), + agent: z + .object({ + model: z.string().default(agentExtractModelValue), + prompt: z.string().optional(), + }) + .optional(), + }) + .strict(strictMessage) + .transform((data) => ({ + ...data, + systemPrompt: isAgentExtractModelValid(data.agent?.model) + ? `You are an expert web data extractor. Your task is to analyze the provided markdown content from a web page and generate a JSON object based *strictly* on the provided schema. + +Key Instructions: +1. **Schema Adherence:** Populate the JSON object according to the structure defined in the schema. +2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information. +3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value. +4. **SmartScrape Recommendation:** + * Assess if the *full* required data seems unavailable in the current markdown likely because: + - Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling) + - Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll) + - Content is dynamically loaded after user actions + * If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool. + * If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`. +5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.` + : "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required." + })); export type ExtractOptions = z.infer; @@ -253,17 +304,24 @@ const baseScrapeOptions = z }) .strict(strictMessage); +const fire1Refine = (obj) => { + if (obj.agent?.model?.toLowerCase() === "fire-1" && obj.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") { + return false; + } + return true; +} +const fire1RefineOpts = { + message: "You may only specify the FIRE-1 model in agent or jsonOptions.agent, but not both.", +}; const extractRefine = (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); const hasExtractOptions = obj.extract !== undefined; const hasJsonFormat = obj.formats?.includes("json"); const hasJsonOptions = obj.jsonOptions !== undefined; return ( - (hasExtractFormat && hasExtractOptions) - || (!hasExtractFormat && !hasExtractOptions) - ) && ( - (hasJsonFormat && hasJsonOptions) - || (!hasJsonFormat && !hasJsonOptions) + ((hasExtractFormat && hasExtractOptions) || + (!hasExtractFormat && !hasExtractOptions)) && + ((hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions)) ); }; const extractRefineOpts = { @@ -277,7 +335,7 @@ const extractTransform = (obj) => { obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && - (obj.timeout === 30000) + obj.timeout === 30000 ) { obj = { ...obj, timeout: 60000 }; } @@ -290,6 +348,10 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 60000 }; } + if (obj.agent) { + obj = { ...obj, timeout: 300000 }; + } + if (obj.formats?.includes("json")) { obj.formats.push("extract"); } @@ -302,6 +364,7 @@ const extractTransform = (obj) => { prompt: obj.jsonOptions.prompt, systemPrompt: obj.jsonOptions.systemPrompt, schema: obj.jsonOptions.schema, + agent: obj.jsonOptions.agent, mode: "llm", }, }; @@ -311,6 +374,16 @@ const extractTransform = (obj) => { }; export const scrapeOptions = baseScrapeOptions + .extend({ + agent: z + .object({ + model: z.string().default(agentExtractModelValue), + prompt: z.string().optional(), + }) + .optional(), + extract: extractOptionsWithAgent.optional(), + jsonOptions: extractOptionsWithAgent.optional(), + }) .refine( (obj) => { if (!obj.actions) return true; @@ -324,11 +397,13 @@ export const scrapeOptions = baseScrapeOptions }, ) .refine(extractRefine, extractRefineOpts) + .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); -export type ScrapeOptions = z.infer; +export type ScrapeOptions = z.infer; import Ajv from "ajv"; +import type { CostTracking } from "../../lib/extract/extraction-service"; const ajv = new Ajv(); @@ -362,7 +437,7 @@ export const extractV1Options = z includeSubdomains: z.boolean().default(true), allowExternalLinks: z.boolean().default(false), enableWebSearch: z.boolean().default(false), - scrapeOptions: scrapeOptions.default({ onlyMainContent: false }).optional(), + scrapeOptions: baseScrapeOptions.default({ onlyMainContent: false }).optional(), origin: z.string().optional().default("api"), urlTrace: z.boolean().default(false), timeout: z.number().int().positive().finite().safe().default(60000), @@ -375,14 +450,13 @@ export const extractV1Options = z .enum(["direct", "save", "load"]) .default("direct") .optional(), + agent: agentOptionsExtract.optional(), + __experimental_showCostTracking: z.boolean().default(false), }) .strict(strictMessage) - .refine( - (obj) => obj.urls || obj.prompt, - { - message: "Either 'urls' or 'prompt' must be provided.", - }, - ) + .refine((obj) => obj.urls || obj.prompt, { + message: "Either 'urls' or 'prompt' must be provided.", + }) .transform((obj) => ({ ...obj, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, @@ -391,6 +465,10 @@ export const extractV1Options = z (x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true), extractRefineOpts, ) + .refine( + (x) => (x.scrapeOptions ? fire1Refine(x.scrapeOptions) : true), + fire1RefineOpts, + ) .transform((x) => ({ ...x, scrapeOptions: x.scrapeOptions @@ -407,11 +485,20 @@ export const scrapeRequestSchema = baseScrapeOptions .omit({ timeout: true }) .extend({ url, + agent: z + .object({ + model: z.string().default(agentExtractModelValue), + prompt: z.string().optional(), + }) + .optional(), + extract: extractOptionsWithAgent.optional(), + jsonOptions: extractOptionsWithAgent.optional(), origin: z.string().optional().default("api"), timeout: z.number().int().positive().finite().safe().default(30000), }) .strict(strictMessage) .refine(extractRefine, extractRefineOpts) + .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); export type ScrapeRequest = z.infer; @@ -447,6 +534,7 @@ export const batchScrapeRequestSchema = baseScrapeOptions }) .strict(strictMessage) .refine(extractRefine, extractRefineOpts) + .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions @@ -459,6 +547,7 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions }) .strict(strictMessage) .refine(extractRefine, extractRefineOpts) + .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); export type BatchScrapeRequest = z.infer; @@ -498,12 +587,13 @@ export const crawlRequestSchema = crawlerOptions .extend({ url, origin: z.string().optional().default("api"), - scrapeOptions: scrapeOptions.default({}), + scrapeOptions: baseScrapeOptions.default({}), webhook: webhookSchema.optional(), limit: z.number().default(10000), }) .strict(strictMessage) .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts) + .refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts) .transform((x) => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions), @@ -563,8 +653,8 @@ export type Document = { screenshots?: string[]; scrapes?: ScrapeActionContent[]; javascriptReturns?: { - type: string, - value: unknown + type: string; + value: unknown; }[]; }; changeTracking?: { @@ -609,6 +699,7 @@ export type Document = { ogLocaleAlternate?: string[]; ogSiteName?: string; ogVideo?: string; + favicon?: string; dcTermsCreated?: string; dcDateCreated?: string; dcDate?: string; @@ -628,7 +719,8 @@ export type Document = { statusCode: number; scrapeId?: string; error?: string; - [key: string]: string | string[] | number | undefined; + costTracking?: CostTracking; + // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; serpResults?: { title: string; @@ -798,6 +890,8 @@ export type AuthCreditUsageChunk = { preview: number; crawlStatus: number; extractStatus: number; + extractAgentPreview?: number; + scrapeAgentPreview?: number; }; concurrency: number; @@ -895,7 +989,7 @@ export function fromLegacyCrawlerOptions(x: any, teamId: string): { ignoreQueryParameters: x.ignoreQueryParameters, regexOnFullURL: x.regexOnFullURL, maxDiscoveryDepth: x.maxDiscoveryDepth, - }), + }), internalOptions: { v0CrawlOnlyUrls: x.returnOnlyUrls, teamId, @@ -1054,6 +1148,7 @@ export const searchRequestSchema = z "Unrecognized key in body -- please review the v1 API documentation for request body changes", ) .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts) + .refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts) .transform((x) => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions), @@ -1099,6 +1194,6 @@ export type GenerateLLMsTextRequest = z.infer< export class TimeoutSignal extends Error { constructor() { - super("Operation timed out") + super("Operation timed out"); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 50fce459..efca0479 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -25,6 +25,7 @@ import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; import { RateLimiterMode } from "./types"; +import { attachWsProxy } from "./services/agentLivecastWS"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -39,7 +40,9 @@ const cacheable = new CacheableLookup(); cacheable.install(http.globalAgent); cacheable.install(https.globalAgent); -const ws = expressWs(express()); +// Initialize Express with WebSocket support +const expressApp = express(); +const ws = expressWs(expressApp); const app = ws.app; global.isProduction = process.env.IS_PRODUCTION === "true"; @@ -87,6 +90,9 @@ const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; function startServer(port = DEFAULT_PORT) { + // Attach WebSocket proxy to the Express app + attachWsProxy(app); + const server = app.listen(Number(port), HOST, () => { logger.info(`Worker ${process.pid} listening on port ${port}`); }); diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index 900dcd21..87655fac 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -5,9 +5,12 @@ import { DeepResearchSource, updateDeepResearch, } from "./deep-research-redis"; -import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract"; +import { + generateCompletions, + trimToTokenLimit, +} from "../../scraper/scrapeURL/transformers/llmExtract"; import { ExtractOptions } from "../../controllers/v1/types"; -import { openai } from "@ai-sdk/openai/dist"; + import { getModel } from "../generic-ai"; interface AnalysisResult { gaps: string[]; @@ -52,7 +55,7 @@ export class ResearchStateManager { } async addActivity(activities: DeepResearchActivity[]): Promise { - if (activities.some(activity => activity.status === "complete")) { + if (activities.some((activity) => activity.status === "complete")) { this.completedSteps++; } @@ -190,7 +193,7 @@ export class ResearchLLMService { Every search query is a new SERP query so make sure the whole context is added without overwhelming the search engine. The first SERP query you generate should be a very concise, simple version of the topic. `, }, - markdown: "" + markdown: "", }); return extract.queries; @@ -260,31 +263,31 @@ export class ResearchLLMService { formats?: string[], jsonOptions?: ExtractOptions, ): Promise { - if(!formats) { - formats = ['markdown']; + if (!formats) { + formats = ["markdown"]; } - if(!jsonOptions) { + if (!jsonOptions) { jsonOptions = undefined; } - + const { extract } = await generateCompletions({ logger: this.logger.child({ method: "generateFinalAnalysis", }), - mode: formats.includes('json') ? 'object' : 'no-object', + mode: formats.includes("json") ? "object" : "no-object", options: { mode: "llm", - ...(formats.includes('json') && { - ...jsonOptions + ...(formats.includes("json") && { + ...jsonOptions, }), - systemPrompt: formats.includes('json') + systemPrompt: formats.includes("json") ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly." : "You are an expert research analyst who creates comprehensive, well-structured reports. Don't begin the report by saying 'Here is the report', nor 'Below is the report', nor something similar. ALWAYS start with a great title that reflects the research topic and findings. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + new Date().toISOString().split("T")[0], prompt: trimToTokenLimit( analysisPrompt ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}` - : formats.includes('json') + : formats.includes("json") ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}` : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis. @@ -308,7 +311,7 @@ export class ResearchLLMService { ).text, }, markdown: "", - model: getModel('o3-mini'), + model: getModel("o3-mini"), }); return extract; diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index f7dd4e32..a593efc3 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -10,7 +10,7 @@ Provide a rephrased search query that: 4. Is concise and focused 5. Short is better than long 6. It is a search engine, not a chatbot -7. Concise +7. Concise, no more than 3 words besides the site Return only the rephrased search query, without any explanation or additional text.`; } @@ -40,7 +40,20 @@ to determine their relevance to the user's query and intent. } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`; + return `Given these URLs and their content, analyze their relevance to this extraction request: "${searchQuery}". + +For each URL, consider: +1. How well it matches the extraction needs +2. The quantity and quality of extractable information +3. Whether the content structure matches what we're looking for + +Score each URL from 0-1 based on the scoring guidelines provided in the system prompt. + +Provide detailed reasoning for each URL to explain why you assigned that score, considering: +- Content relevance +- Information completeness +- Structure suitability +- Potential extraction value`; } // Multi entity schema anlayzer @@ -73,7 +86,7 @@ export function buildAnalyzeSchemaUserPrompt( urls: string[], ): string { return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none: -Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`; +Schema: ${schemaString}\nPrompt: ${prompt}\n URLs: ${urls}`; } // Should Extract @@ -97,8 +110,7 @@ export function buildBatchExtractSystemPrompt( ): string { return ( (systemPrompt ? `${systemPrompt}\n` : "") + - `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` + - links.join(", ") + `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null.` ); } diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index 6aeee679..b02588be 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -19,12 +19,16 @@ export async function analyzeSchemaAndPrompt( ): Promise<{ isMultiEntity: boolean; multiEntityKeys: string[]; - reasoning?: string; - keyIndicators?: string[]; + reasoning: string; + keyIndicators: string[]; tokenUsage: TokenUsage; + cost: number; }> { + let cost = 0; if (!schema) { - schema = await generateSchemaFromPrompt(prompt); + const genRes = await generateSchemaFromPrompt(prompt); + schema = genRes.extract; + cost = genRes.cost; } const schemaString = JSON.stringify(schema); @@ -44,7 +48,7 @@ export async function analyzeSchemaAndPrompt( ); try { - const { extract: result, totalUsage } = await generateCompletions({ + const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({ logger, options: { mode: "llm", @@ -55,6 +59,7 @@ export async function analyzeSchemaAndPrompt( markdown: "", model, }); + cost += cost2; const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } = checkSchema.parse(result); @@ -65,6 +70,7 @@ export async function analyzeSchemaAndPrompt( reasoning, keyIndicators, tokenUsage: totalUsage, + cost, }; } catch (e) { logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", { @@ -83,5 +89,6 @@ export async function analyzeSchemaAndPrompt( totalTokens: 0, model: model.modelId, }, + cost: 0, }; } diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index f70c8390..e7f2bb03 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -1,5 +1,8 @@ import { logger } from "../../../lib/logger"; -import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { + generateCompletions, + GenerateCompletionsOptions, +} from "../../../scraper/scrapeURL/transformers/llmExtract"; import { buildDocument } from "../build-document"; import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types"; import { Document } from "../../../controllers/v1/types"; @@ -7,6 +10,19 @@ import { buildBatchExtractPrompt, buildBatchExtractSystemPrompt, } from "../build-prompts"; +import { getModel } from "../../generic-ai"; + +import fs from "fs/promises"; +import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; + +type BatchExtractOptions = { + multiEntitySchema: any; + links: string[]; + prompt: string; + systemPrompt: string; + doc: Document; + useAgent: boolean; +}; /** * Batch extract information from a list of URLs using a multi-entity schema. @@ -17,20 +33,21 @@ import { * @param doc - The document to extract information from * @returns The completion promise */ -export async function batchExtractPromise( - multiEntitySchema: any, - links: string[], - prompt: string, - systemPrompt: string, - doc: Document, -): Promise<{ - extract: any; +export async function batchExtractPromise(options: BatchExtractOptions): Promise<{ + extract: any; // array of extracted data numTokens: number; totalUsage: TokenUsage; warning?: string; sources: string[]; + smartScrapeCost: number; + otherCost: number; + smartScrapeCallCount: number; + otherCallCount: number; }> { - const completion = await generateCompletions({ + const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent } = options; + + + const generationOptions: GenerateCompletionsOptions = { logger: logger.child({ method: "extractService/generateCompletions", }), @@ -45,13 +62,49 @@ export async function batchExtractPromise( schema: multiEntitySchema, }, markdown: buildDocument(doc), - isExtractEndpoint: true - }); + isExtractEndpoint: true, + model: getModel("gemini-2.0-flash", "google"), + }; + let extractedDataArray: any[] = []; + let warning: string | undefined; + let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0; + try { + const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ + extractOptions: generationOptions, + urls: [doc.metadata.sourceURL || doc.metadata.url || ""], + useAgent, + }); + extractedDataArray = e; + warning = w; + smCost = smartScrapeCost; + oCost = otherCost; + smCallCount = smartScrapeCallCount; + oCallCount = otherCallCount; + } catch (error) { + console.error(">>>>>>>error>>>>>\n", error); + } + + // await fs.writeFile( + // `logs/extractedDataArray-${crypto.randomUUID()}.json`, + // JSON.stringify(extractedDataArray, null, 2), + // ); + + // TODO: fix this return { - extract: completion.extract, - numTokens: completion.numTokens, - totalUsage: completion.totalUsage, - sources: [doc.metadata.url || doc.metadata.sourceURL || ""] + extract: extractedDataArray, + numTokens: 0, + totalUsage: { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + model: "gemini-2.0-flash", + }, + warning: warning, + sources: [doc.metadata.url || doc.metadata.sourceURL || ""], + smartScrapeCost: smCost, + otherCost: oCost, + smartScrapeCallCount: smCallCount, + otherCallCount: oCallCount, }; } diff --git a/apps/api/src/lib/extract/completions/checkShouldExtract.ts b/apps/api/src/lib/extract/completions/checkShouldExtract.ts index da945010..3bff4fc7 100644 --- a/apps/api/src/lib/extract/completions/checkShouldExtract.ts +++ b/apps/api/src/lib/extract/completions/checkShouldExtract.ts @@ -12,7 +12,7 @@ export async function checkShouldExtract( prompt: string, multiEntitySchema: any, doc: Document, -): Promise<{ tokenUsage: TokenUsage; extract: boolean }> { +): Promise<{ tokenUsage: TokenUsage; extract: boolean; cost: number }> { const shouldExtractCheck = await generateCompletions({ logger: logger.child({ method: "extractService/checkShouldExtract" }), options: { @@ -37,5 +37,6 @@ export async function checkShouldExtract( return { tokenUsage: shouldExtractCheck.totalUsage, extract: shouldExtractCheck.extract["extract"], + cost: shouldExtractCheck.cost, }; } diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 86d19fe6..27e3cad0 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -1,7 +1,12 @@ import { logger } from "../../../lib/logger"; -import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { + generateCompletions, + GenerateCompletionsOptions, +} from "../../../scraper/scrapeURL/transformers/llmExtract"; import { buildDocument } from "../build-document"; import { Document, TokenUsage } from "../../../controllers/v1/types"; +import { getModel } from "../../../lib/generic-ai"; +import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; export async function singleAnswerCompletion({ singleAnswerDocs, @@ -9,34 +14,85 @@ export async function singleAnswerCompletion({ links, prompt, systemPrompt, + useAgent }: { singleAnswerDocs: Document[]; rSchema: any; links: string[]; prompt: string; systemPrompt: string; + useAgent: boolean; }): Promise<{ extract: any; tokenUsage: TokenUsage; sources: string[]; + smartScrapeCallCount: number; + smartScrapeCost: number; + otherCallCount: number; + otherCost: number; }> { - const completion = await generateCompletions({ + const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; + const generationOptions: GenerateCompletionsOptions = { logger: logger.child({ module: "extract", method: "generateCompletions" }), options: { mode: "llm", systemPrompt: (systemPrompt ? `${systemPrompt}\n` : "") + - "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + - links.join(", "), - prompt: "Today is: " + new Date().toISOString() + "\n" + prompt, - schema: rSchema, - }, - markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"), - isExtractEndpoint: true + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.", + prompt: docsPrompt, + schema: rSchema, + }, + markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`, + isExtractEndpoint: true, + model: getModel("gemini-2.0-flash", "google"), + }; + + const { extractedDataArray, warning, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ + extractOptions: generationOptions, + urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""), + useAgent, }); - return { - extract: completion.extract, - tokenUsage: completion.totalUsage, - sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "") + + const completion = { + extract: extractedDataArray, + tokenUsage: { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + model: "gemini-2.0-flash", + }, + sources: singleAnswerDocs.map( + (doc) => doc.metadata.url || doc.metadata.sourceURL || "", + ), + }; + + // const completion = await generateCompletions({ + // logger: logger.child({ module: "extract", method: "generateCompletions" }), + // options: { + // mode: "llm", + // systemPrompt: + // (systemPrompt ? `${systemPrompt}\n` : "") + + // "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.", + // prompt: "Today is: " + new Date().toISOString() + "\n" + prompt, + // schema: rSchema, + // }, + // markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"), + // isExtractEndpoint: true, + // model: getModel("gemini-2.0-flash", "google"), + // }); + // await fs.writeFile( + // `logs/singleAnswer-${crypto.randomUUID()}.json`, + // JSON.stringify(completion, null, 2), + // ); + return { + extract: completion.extract, + tokenUsage: completion.tokenUsage, + sources: singleAnswerDocs.map( + (doc) => doc.metadata.url || doc.metadata.sourceURL || "", + ), + smartScrapeCost, + otherCost, + smartScrapeCallCount, + otherCallCount, }; } diff --git a/apps/api/src/lib/extract/config.ts b/apps/api/src/lib/extract/config.ts index f6609f07..c401c5a6 100644 --- a/apps/api/src/lib/extract/config.ts +++ b/apps/api/src/lib/extract/config.ts @@ -2,8 +2,8 @@ export const extractConfig = { RERANKING: { MAX_INITIAL_RANKING_LIMIT: 1000, MAX_RANKING_LIMIT_FOR_RELEVANCE: 100, - INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75, - FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5, + INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001, + FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001, MIN_REQUIRED_LINKS: 1, }, DEDUPLICATION: { diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index 5c2ecdb4..e560f18d 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -1,5 +1,6 @@ import { redisConnection } from "../../services/queue-service"; import { logger as _logger } from "../logger"; +import { CostTracking } from "./extraction-service"; export enum ExtractStep { INITIAL = "initial", @@ -32,6 +33,8 @@ export type StoredExtract = { showLLMUsage?: boolean; showSources?: boolean; llmUsage?: number; + showCostTracking?: boolean; + costTracking?: CostTracking; sources?: { [key: string]: string[]; }; diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index f3d0c87b..882d115d 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -1,6 +1,7 @@ import { Document, ExtractRequest, + isAgentExtractModelValid, TokenUsage, URLTrace, } from "../../controllers/v1/types"; @@ -26,12 +27,8 @@ import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array"; import { mergeNullValObjs } from "./helpers/merge-null-val-objs"; import { areMergeable } from "./helpers/merge-null-val-objs"; import { CUSTOM_U_TEAMS } from "./config"; -import { - calculateFinalResultCost, - estimateTotalCost, -} from "./usage/llm-cost"; +import { calculateFinalResultCost, estimateTotalCost } from "./usage/llm-cost"; import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt"; -import { checkShouldExtract } from "./completions/checkShouldExtract"; import { batchExtractPromise } from "./completions/batchExtract"; import { singleAnswerCompletion } from "./completions/singleAnswer"; import { SourceTracker } from "./helpers/source-tracker"; @@ -39,13 +36,14 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs"; import { normalizeUrl } from "../canonical-url"; import { search } from "../../search"; import { buildRephraseToSerpPrompt } from "./build-prompts"; - +import fs from "fs/promises"; interface ExtractServiceOptions { request: ExtractRequest; teamId: string; subId?: string; cacheMode?: "load" | "save" | "direct"; cacheKey?: string; + agent?: boolean; } export interface ExtractResult { @@ -69,6 +67,14 @@ type completions = { sources?: string[]; }; +export type CostTracking = { + smartScrapeCallCount: number; + smartScrapeCost: number; + otherCallCount: number; + otherCost: number; + totalCost: number; + costLimitExceededTokenUsage?: number; +}; export async function performExtraction( extractId: string, @@ -83,7 +89,18 @@ export async function performExtraction( let singleAnswerResult: any = {}; let totalUrlsScraped = 0; let sources: Record = {}; + let costTracking: CostTracking = { + smartScrapeCallCount: 0, + smartScrapeCost: 0, + otherCallCount: 0, + otherCost: 0, + totalCost: 0, + }; + let log = { + extractId, + request, + }; const logger = _logger.child({ module: "extract", @@ -97,13 +114,21 @@ export async function performExtraction( logger.debug("Generating URLs from prompt...", { prompt: request.prompt, }); - const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt)); + const rephrasedPrompt = await generateBasicCompletion( + buildRephraseToSerpPrompt(request.prompt), + ); + let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || ""; + if (rephrasedPrompt) { + costTracking.otherCallCount++; + costTracking.otherCost += rephrasedPrompt.cost; + costTracking.totalCost += rephrasedPrompt.cost; + } const searchResults = await search({ - query: rephrasedPrompt.replace('"', "").replace("'", ""), + query: rptxt, num_results: 10, }); - request.urls = searchResults.map(result => result.url) as string[]; + request.urls = searchResults.map((result) => result.url) as string[]; } if (request.urls && request.urls.length === 0) { logger.error("No search results found", { @@ -118,7 +143,11 @@ export async function performExtraction( const urls = request.urls || ([] as string[]); - if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) { + if ( + request.__experimental_cacheMode == "load" && + request.__experimental_cacheKey && + urls + ) { logger.debug("Loading cached docs..."); try { const cache = await getCachedDocs(urls, request.__experimental_cacheKey); @@ -147,12 +176,66 @@ export async function performExtraction( ], }); + let reqSchema = request.schema; + if (!reqSchema && request.prompt) { + const schemaGenRes = await generateSchemaFromPrompt(request.prompt); + reqSchema = schemaGenRes.extract; + costTracking.otherCallCount++; + costTracking.otherCost += schemaGenRes.cost; + costTracking.totalCost += schemaGenRes.cost; + + logger.debug("Generated request schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + } + + if (reqSchema) { + reqSchema = await dereferenceSchema(reqSchema); + } + + logger.debug("Transformed schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + + let rSchema = reqSchema; + + // agent evaluates if the schema or the prompt has an array with big amount of items + // also it checks if the schema any other properties that are not arrays + // if so, it splits the results into 2 types of completions: + // 1. the first one is a completion that will extract the array of items + // 2. the second one is multiple completions that will extract the items from the array + let startAnalyze = Date.now(); + const { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + tokenUsage: schemaAnalysisTokenUsage, + cost: schemaAnalysisCost, + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? ""); + + logger.debug("Analyzed schema.", { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + }); + + costTracking.otherCallCount++; + costTracking.otherCost += schemaAnalysisCost; + costTracking.totalCost += schemaAnalysisCost; + + // Track schema analysis tokens + tokenUsage.push(schemaAnalysisTokenUsage); + let startMap = Date.now(); let aggMapLinks: string[] = []; logger.debug("Processing URLs...", { urlCount: request.urls?.length || 0, }); - + const urlPromises = urls.map((url) => processUrl( { @@ -164,6 +247,11 @@ export async function performExtraction( limit: request.limit, includeSubdomains: request.includeSubdomains, schema: request.schema, + log, + isMultiEntity, + reasoning, + multiEntityKeys, + keyIndicators, }, urlTraces, (links: string[]) => { @@ -180,6 +268,7 @@ export async function performExtraction( }); }, logger.child({ module: "extract", method: "processUrl", url }), + costTracking, ), ); @@ -189,6 +278,9 @@ export async function performExtraction( linkCount: links.length, }); + log["links"] = links; + log["linksLength"] = links.length; + if (links.length === 0) { logger.error("0 links! Bailing.", { linkCount: links.length, @@ -215,55 +307,8 @@ export async function performExtraction( ], }); - let reqSchema = request.schema; - if (!reqSchema && request.prompt) { - reqSchema = await generateSchemaFromPrompt(request.prompt); - logger.debug("Generated request schema.", { - originalSchema: request.schema, - schema: reqSchema, - }); - } - - if (reqSchema) { - reqSchema = await dereferenceSchema(reqSchema); - } - - logger.debug("Transformed schema.", { - originalSchema: request.schema, - schema: reqSchema, - }); - - // agent evaluates if the schema or the prompt has an array with big amount of items - // also it checks if the schema any other properties that are not arrays - // if so, it splits the results into 2 types of completions: - // 1. the first one is a completion that will extract the array of items - // 2. the second one is multiple completions that will extract the items from the array - let startAnalyze = Date.now(); - const { - isMultiEntity, - multiEntityKeys, - reasoning, - keyIndicators, - tokenUsage: schemaAnalysisTokenUsage, - } = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? ""); - - logger.debug("Analyzed schema.", { - isMultiEntity, - multiEntityKeys, - reasoning, - keyIndicators, - }); - - // Track schema analysis tokens - tokenUsage.push(schemaAnalysisTokenUsage); - - // console.log("\nIs Multi Entity:", isMultiEntity); - // console.log("\nMulti Entity Keys:", multiEntityKeys); - // console.log("\nReasoning:", reasoning); - // console.log("\nKey Indicators:", keyIndicators); - - let rSchema = reqSchema; if (isMultiEntity && reqSchema) { + log["isMultiEntity"] = true; logger.debug("=== MULTI-ENTITY ==="); const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( @@ -301,7 +346,8 @@ export async function performExtraction( logger.debug("Starting multi-entity scrape..."); let startScrape = Date.now(); - + log["docsSizeBeforeMultiEntityScrape"] = docsMap.size; + const scrapePromises = links.map((url) => { if (!docsMap.has(normalizeUrl(url))) { return scrapeDocument( @@ -323,7 +369,7 @@ export async function performExtraction( // Needs to be true for multi-entity to work properly onlyMainContent: true, - } + }, ); } return docsMap.get(normalizeUrl(url)); @@ -333,6 +379,8 @@ export async function performExtraction( (doc): doc is Document => doc !== null, ); + log["docsSizeAfterMultiEntityScrape"] = scrapePromises.length; + logger.debug("Multi-entity scrape finished.", { docCount: multyEntityDocs.length, }); @@ -365,7 +413,7 @@ export async function performExtraction( const chunkSize = 50; const timeoutCompletion = 45000; // 45 second timeout const chunks: Document[][] = []; - const extractionResults: {extract: any, url: string}[] = []; + const extractionResults: { extract: any; url: string }[] = []; // Split into chunks for (let i = 0; i < multyEntityDocs.length; i += chunkSize) { @@ -383,68 +431,36 @@ export async function performExtraction( setTimeout(() => resolve(null), timeoutCompletion); }); - // Check if page should be extracted before proceeding - const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract( - request.prompt ?? "", + const completionPromise = batchExtractPromise({ multiEntitySchema, + links, + prompt: request.prompt ?? "", + systemPrompt: request.systemPrompt ?? "", doc, - ); - - tokenUsage.push(shouldExtractCheckTokenUsage); - - if (!extract) { - logger.info( - `Skipping extraction for ${doc.metadata.url} as content is irrelevant`, - ); - return null; - } - // Add confidence score to schema with 5 levels - const schemaWithConfidence = { - ...multiEntitySchema, - properties: { - ...multiEntitySchema.properties, - is_content_relevant: { - type: "boolean", - description: - "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.", - }, - }, - required: [ - ...(multiEntitySchema.required || []), - "is_content_relevant", - ], - }; - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MULTI_ENTITY_EXTRACT, - startedAt: startScrape, - finishedAt: Date.now(), - discoveredLinks: [ - doc.metadata.url || doc.metadata.sourceURL || "", - ], - }, - ], + useAgent: isAgentExtractModelValid(request.agent?.model) }); - const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc); - // Race between timeout and completion - const multiEntityCompletion = (await Promise.race([ - completionPromise, - timeoutPromise, - ])) as Awaited>; + const multiEntityCompletion = (await completionPromise) as Awaited< + ReturnType + >; + + // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema // Track multi-entity extraction tokens if (multiEntityCompletion) { tokenUsage.push(multiEntityCompletion.totalUsage); - + + costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount; + costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost; + costTracking.otherCallCount += multiEntityCompletion.otherCallCount; + costTracking.otherCost += multiEntityCompletion.otherCost; + costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost; + if (multiEntityCompletion.extract) { return { extract: multiEntityCompletion.extract, - url: doc.metadata.url || doc.metadata.sourceURL || "" + url: doc.metadata.url || doc.metadata.sourceURL || "", }; } } @@ -490,42 +506,115 @@ export async function performExtraction( return null; } }); - // Wait for current chunk to complete before processing next chunk const chunkResults = await Promise.all(chunkPromises); - const validResults = chunkResults.filter((result): result is {extract: any, url: string} => result !== null); + const validResults = chunkResults.filter( + (result): result is { extract: any; url: string } => result !== null, + ); extractionResults.push(...validResults); - multiEntityCompletions.push(...validResults.map(r => r.extract)); + // Merge all extracts from valid results into a single array + const extractArrays = validResults.map((r) => + Array.isArray(r.extract) ? r.extract : [r.extract], + ); + const mergedExtracts = extractArrays.flat(); + multiEntityCompletions.push(...mergedExtracts); + multiEntityCompletions = multiEntityCompletions.filter((c) => c !== null); logger.debug("All multi-entity completion chunks finished.", { completionCount: multiEntityCompletions.length, }); + log["multiEntityCompletionsLength"] = multiEntityCompletions.length; } try { // Use SourceTracker to handle source tracking const sourceTracker = new SourceTracker(); - - // Transform and merge results while preserving sources - sourceTracker.transformResults(extractionResults, multiEntitySchema, false); - - multiEntityResult = transformArrayToObject( - multiEntitySchema, - multiEntityCompletions, - ); - - // Track sources before deduplication - sourceTracker.trackPreDeduplicationSources(multiEntityResult); - - // Apply deduplication and merge - multiEntityResult = deduplicateObjectsArray(multiEntityResult); - multiEntityResult = mergeNullValObjs(multiEntityResult); - - // Map sources to final deduplicated/merged items - const multiEntitySources = sourceTracker.mapSourcesToFinalItems(multiEntityResult, multiEntityKeys); - Object.assign(sources, multiEntitySources); + logger.debug("Created SourceTracker instance"); + // Transform and merge results while preserving sources + try { + sourceTracker.transformResults( + extractionResults, + multiEntitySchema, + false, + ); + logger.debug("Successfully transformed results with sourceTracker"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in sourceTracker.transformResults: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in sourceTracker.transformResults:`, { error }); + throw error; + } + + try { + multiEntityResult = transformArrayToObject( + multiEntitySchema, + multiEntityCompletions, + ); + logger.debug("Successfully transformed array to object"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in transformArrayToObject: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in transformArrayToObject:`, { error }); + throw error; + } + + // Track sources before deduplication + try { + sourceTracker.trackPreDeduplicationSources(multiEntityResult); + logger.debug("Successfully tracked pre-deduplication sources"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in trackPreDeduplicationSources: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in trackPreDeduplicationSources:`, { error }); + throw error; + } + + // Apply deduplication and merge + try { + multiEntityResult = deduplicateObjectsArray(multiEntityResult); + logger.debug("Successfully deduplicated objects array"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in deduplicateObjectsArray: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in deduplicateObjectsArray:`, { error }); + throw error; + } + + try { + multiEntityResult = mergeNullValObjs(multiEntityResult); + logger.debug("Successfully merged null value objects"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in mergeNullValObjs: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in mergeNullValObjs:`, { error }); + throw error; + } + + // Map sources to final deduplicated/merged items + try { + const multiEntitySources = sourceTracker.mapSourcesToFinalItems( + multiEntityResult, + multiEntityKeys, + ); + Object.assign(sources, multiEntitySources); + logger.debug("Successfully mapped sources to final items"); + } catch (error) { + const errorLog = `[${new Date().toISOString()}] Error in mapSourcesToFinalItems: ${JSON.stringify(error, null, 2)}\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Error in mapSourcesToFinalItems:`, { error }); + throw error; + } } catch (error) { - logger.error(`Failed to transform array to object`, { error }); + const errorLog = `[${new Date().toISOString()}] Failed to transform array to object\nError: ${JSON.stringify(error, null, 2)}\nStack: ${error.stack}\nMultiEntityResult: ${JSON.stringify(multiEntityResult, null, 2)}\nMultiEntityCompletions: ${JSON.stringify(multiEntityCompletions, null, 2)}\nMultiEntitySchema: ${JSON.stringify(multiEntitySchema, null, 2)}\n\n`; + await fs.appendFile('logs/extraction-errors.log', errorLog); + logger.error(`Failed to transform array to object`, { + error, + errorMessage: error.message, + errorStack: error.stack, + multiEntityResult: JSON.stringify(multiEntityResult), + multiEntityCompletions: JSON.stringify(multiEntityCompletions), + multiEntitySchema: JSON.stringify(multiEntitySchema) + }); return { success: false, error: @@ -542,6 +631,7 @@ export async function performExtraction( rSchema.properties && Object.keys(rSchema.properties).length > 0 ) { + log["isSingleEntity"] = true; logger.debug("=== SINGLE PAGES ===", { linkCount: links.length, schema: rSchema, @@ -564,6 +654,7 @@ export async function performExtraction( }, ], }); + log["docsSizeBeforeSingleEntityScrape"] = docsMap.size; const scrapePromises = links.map((url) => { if (!docsMap.has(normalizeUrl(url))) { return scrapeDocument( @@ -580,7 +671,7 @@ export async function performExtraction( url, isMultiEntity: false, }), - request.scrapeOptions + request.scrapeOptions, ); } return docsMap.get(normalizeUrl(url)); @@ -588,6 +679,7 @@ export async function performExtraction( try { const results = await Promise.all(scrapePromises); + log["docsSizeAfterSingleEntityScrape"] = docsMap.size; for (const doc of results) { if (doc?.metadata?.url) { @@ -640,31 +732,53 @@ export async function performExtraction( // Generate completions logger.debug("Generating singleAnswer completions..."); - let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({ + log["singleAnswerDocsLength"] = singleAnswerDocs.length; + let { + extract: completionResult, + tokenUsage: singleAnswerTokenUsage, + sources: singleAnswerSources, + smartScrapeCost: singleAnswerSmartScrapeCost, + otherCost: singleAnswerOtherCost, + smartScrapeCallCount: singleAnswerSmartScrapeCallCount, + otherCallCount: singleAnswerOtherCallCount, + } = await singleAnswerCompletion({ singleAnswerDocs, rSchema, links, prompt: request.prompt ?? "", systemPrompt: request.systemPrompt ?? "", + useAgent: isAgentExtractModelValid(request.agent?.model), }); + costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; + costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; + costTracking.otherCost += singleAnswerOtherCost; + costTracking.otherCallCount += singleAnswerOtherCallCount; + costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost; logger.debug("Done generating singleAnswer completions."); + singleAnswerResult = transformArrayToObject(rSchema, completionResult); + + singleAnswerResult = deduplicateObjectsArray(singleAnswerResult); // Track single answer extraction tokens and sources if (completionResult) { tokenUsage.push(singleAnswerTokenUsage); - + // Add sources for top-level properties in single answer if (rSchema?.properties) { - Object.keys(rSchema.properties).forEach(key => { + Object.keys(rSchema.properties).forEach((key) => { if (completionResult[key] !== undefined) { - sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""); + sources[key] = + singleAnswerSources || + singleAnswerDocs.map( + (doc) => doc.metadata.url || doc.metadata.sourceURL || "", + ); } }); } } - singleAnswerResult = completionResult; - singleAnswerCompletions = singleAnswerResult; + // singleAnswerResult = completionResult; + // singleAnswerCompletions = singleAnswerResult; // Update token usage in traces // if (completions && completions.numTokens) { @@ -686,6 +800,9 @@ export async function performExtraction( // } } + log["singleAnswerResult"] = singleAnswerResult; + log["multiEntityResult"] = multiEntityResult; + let finalResult = reqSchema ? await mixSchemaObjects( reqSchema, @@ -776,11 +893,13 @@ export async function performExtraction( num_tokens: totalTokensUsed, tokens_billed: tokensToBill, sources, + cost_tracking: costTracking, }).then(() => { updateExtract(extractId, { status: "completed", llmUsage, sources, + costTracking, }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, @@ -790,15 +909,26 @@ export async function performExtraction( logger.debug("Done!"); - if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) { + if ( + request.__experimental_cacheMode == "save" && + request.__experimental_cacheKey + ) { logger.debug("Saving cached docs..."); try { - await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey); + await saveCachedDocs( + [...docsMap.values()], + request.__experimental_cacheKey, + ); } catch (error) { logger.error("Error saving cached docs", { error }); } } + // fs.writeFile( + // `logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, + // JSON.stringify(log, null, 2), + // ); + return { success: true, data: finalResult ?? {}, diff --git a/apps/api/src/lib/extract/fire-0/build-document-f0.ts b/apps/api/src/lib/extract/fire-0/build-document-f0.ts new file mode 100644 index 00000000..6276eb54 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/build-document-f0.ts @@ -0,0 +1,17 @@ +import { Document } from "../../../controllers/v1/types"; + +export function buildDocument_F0(document: Document): string { + const metadata = document.metadata; + const markdown = document.markdown; + + // for each key in the metadata allow up to 250 characters + const metadataString = Object.entries(metadata) + .map(([key, value]) => { + return `${key}: ${value?.toString().slice(0, 250)}`; + }) + .join("\n"); + + const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`; + const documentString = `${markdown}${documentMetadataString}`; + return documentString; +} diff --git a/apps/api/src/lib/extract/fire-0/build-prompts-f0.ts b/apps/api/src/lib/extract/fire-0/build-prompts-f0.ts new file mode 100644 index 00000000..20b6ac0d --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/build-prompts-f0.ts @@ -0,0 +1,115 @@ +export function buildRefrasedPrompt_F0(prompt: string, url: string): string { + return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}. + + Original prompt: "${prompt}" + + Provide a rephrased search query that: + 1. Maintains the core intent of the original prompt with ONLY the keywords + 2. Uses relevant keywords + 3. Is optimized for search engine results + 4. Is concise and focused + 5. Short is better than long + 6. It is a search engine, not a chatbot + 7. Concise + + Return only the rephrased search query, without any explanation or additional text.`; + } + + export function buildPreRerankPrompt_F0( + prompt: string | undefined, + schema: any, + url: string, + ): string { + const schemaString = JSON.stringify(schema, null, 2); + return `Create a concise search query that combines the key data points from both the schema and prompt. Focus on the core information needed while keeping it general enough to find relevant matches. + + Schema: ${schemaString} + Prompt: ${prompt} + Website to get content from: ${url} + + Return only a concise sentece or 2 focused on the essential data points that the user wants to extract. This will be used by an LLM to determine how releavant the links that are present are to the user's request.`; + } + + export function buildRerankerSystemPrompt_F0(): string { + return `You are a relevance expert scoring links from a website the user is trying to extract information from. Analyze the provided URLs and their content + to determine their relevance to the user's query and intent. + For each URL, assign a relevance score between 0 and 1, where 1 + means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it. + Always return all the links scored that you are giving. Do not omit links. + Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`; + } + + export function buildRerankerUserPrompt_F0(searchQuery: string): string { + return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`; + } + + // Multi entity schema anlayzer + export function buildAnalyzeSchemaPrompt_F0(): string { + return `You are a query classifier for a web scraping system. Classify the data extraction query as either: + A) Single-Answer: One answer across a few pages, possibly containing small arrays. + B) Multi-Entity: Many items across many pages, often involving large arrays. + + Consider: + 1. Answer Cardinality: Single or multiple items? + 2. Page Distribution: Found on 1-3 pages or many? + 3. Verification Needs: Cross-page verification or independent extraction? + + Provide: + - Method: [Single-Answer/Multi-Entity] + - Confidence: [0-100%] + - Reasoning: Why this classification? + - Key Indicators: Specific aspects leading to this decision. + + Examples: + - "Is this company a non-profit?" -> Single-Answer + - "Extract all product prices" -> Multi-Entity + + For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').`; + } + + export function buildAnalyzeSchemaUserPrompt_F0( + schemaString: string, + prompt: string, + urls: string[], + ): string { + return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none: + Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`; + } + + // Should Extract + + export function buildShouldExtractSystemPrompt_F0(): string { + return `You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.`; + } + + export function buildShouldExtractUserPrompt_F0( + prompt: string, + schema: any, + ): string { + return `Should the following content be used to extract information for this prompt: "${prompt}" User schema is: ${JSON.stringify(schema)}\nReturn only true or false.`; + } + + // Batch extract + export function buildBatchExtractSystemPrompt_F0( + systemPrompt: string, + multiEntitySchema: any, + links: string[], + ): string { + return ( + (systemPrompt ? `${systemPrompt}\n` : "") + + `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` + + links.join(", ") + ); + } + + export function buildBatchExtractPrompt_F0(prompt: string): string { + return `Today is: ${new Date().toISOString()}\n${prompt}`; + } + + + export function buildRephraseToSerpPrompt_F0(prompt: string): string { + return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results: + + Original Prompt: "${prompt}"`; + } + \ No newline at end of file diff --git a/apps/api/src/lib/extract/fire-0/completions/analyzeSchemaAndPrompt-f0.ts b/apps/api/src/lib/extract/fire-0/completions/analyzeSchemaAndPrompt-f0.ts new file mode 100644 index 00000000..5121da74 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/completions/analyzeSchemaAndPrompt-f0.ts @@ -0,0 +1,87 @@ +import { TokenUsage } from "../../../../controllers/v1/types"; +import { z } from "zod"; +import { + buildAnalyzeSchemaPrompt, + buildAnalyzeSchemaUserPrompt, +} from "../../build-prompts"; +import { logger } from "../../../logger"; +import { jsonSchema } from "ai"; +import { getModel } from "../../../generic-ai"; +import { + generateCompletions_F0, + generateSchemaFromPrompt_F0, +} from "../llmExtract-f0"; + +export async function analyzeSchemaAndPrompt_F0( + urls: string[], + schema: any, + prompt: string, +): Promise<{ + isMultiEntity: boolean; + multiEntityKeys: string[]; + reasoning?: string; + keyIndicators?: string[]; + tokenUsage: TokenUsage; +}> { + if (!schema) { + schema = await generateSchemaFromPrompt_F0(prompt); + } + + const schemaString = JSON.stringify(schema); + + const model = getModel("gpt-4o"); + + const checkSchema = z + .object({ + isMultiEntity: z.boolean(), + multiEntityKeys: z.array(z.string()).optional().default([]), + reasoning: z.string(), + keyIndicators: z.array(z.string()), + }) + .refine( + (x) => !x.isMultiEntity || x.multiEntityKeys.length > 0, + "isMultiEntity was true, but no multiEntityKeys", + ); + + try { + const { extract: result, totalUsage } = await generateCompletions_F0({ + logger, + options: { + mode: "llm", + schema: checkSchema, + prompt: buildAnalyzeSchemaUserPrompt(schemaString, prompt, urls), + systemPrompt: buildAnalyzeSchemaPrompt(), + }, + markdown: "", + model, + }); + + const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } = + checkSchema.parse(result); + + return { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + tokenUsage: totalUsage, + }; + } catch (e) { + logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", { + error: e, + }); + } + + return { + isMultiEntity: false, + multiEntityKeys: [], + reasoning: "", + keyIndicators: [], + tokenUsage: { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + model: model.modelId, + }, + }; +} diff --git a/apps/api/src/lib/extract/fire-0/completions/batchExtract-f0.ts b/apps/api/src/lib/extract/fire-0/completions/batchExtract-f0.ts new file mode 100644 index 00000000..1358abb8 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/completions/batchExtract-f0.ts @@ -0,0 +1,54 @@ +import { logger } from "../../../../lib/logger"; +import { ExtractResponse, TokenUsage } from "../../../../controllers/v1/types"; +import { Document } from "../../../../controllers/v1/types"; +import { generateCompletions_F0 } from "../llmExtract-f0"; +import { buildBatchExtractPrompt_F0, buildBatchExtractSystemPrompt_F0 } from "../build-prompts-f0"; +import { buildDocument_F0 } from "../build-document-f0"; + +/** + * Batch extract information from a list of URLs using a multi-entity schema. + * @param multiEntitySchema - The schema for the multi-entity extraction + * @param links - The URLs to extract information from + * @param prompt - The prompt for the extraction + * @param systemPrompt - The system prompt for the extraction + * @param doc - The document to extract information from + * @returns The completion promise + */ +export async function batchExtractPromise_F0( + multiEntitySchema: any, + links: string[], + prompt: string, + systemPrompt: string, + doc: Document, +): Promise<{ + extract: any; + numTokens: number; + totalUsage: TokenUsage; + warning?: string; + sources: string[]; +}> { + const completion = await generateCompletions_F0({ + logger: logger.child({ + method: "extractService/generateCompletions", + }), + options: { + mode: "llm", + systemPrompt: buildBatchExtractSystemPrompt_F0( + systemPrompt, + multiEntitySchema, + links, + ), + prompt: buildBatchExtractPrompt_F0(prompt), + schema: multiEntitySchema, + }, + markdown: buildDocument_F0(doc), + isExtractEndpoint: true + }); + + return { + extract: completion.extract, + numTokens: completion.numTokens, + totalUsage: completion.totalUsage, + sources: [doc.metadata.url || doc.metadata.sourceURL || ""] + }; +} diff --git a/apps/api/src/lib/extract/fire-0/completions/checkShouldExtract-f0.ts b/apps/api/src/lib/extract/fire-0/completions/checkShouldExtract-f0.ts new file mode 100644 index 00000000..39c7c771 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/completions/checkShouldExtract-f0.ts @@ -0,0 +1,39 @@ +import { logger } from "../../../../lib/logger"; +import { buildDocument } from "../../build-document"; +import { Document, TokenUsage } from "../../../../controllers/v1/types"; +import { generateCompletions_F0 } from "../llmExtract-f0"; +import { buildShouldExtractSystemPrompt_F0, buildShouldExtractUserPrompt_F0 } from "../build-prompts-f0"; +import { getModel } from "../../../../lib/generic-ai"; + + +export async function checkShouldExtract_F0( + prompt: string, + multiEntitySchema: any, + doc: Document, +): Promise<{ tokenUsage: TokenUsage; extract: boolean }> { + const shouldExtractCheck = await generateCompletions_F0({ + logger: logger.child({ method: "extractService/checkShouldExtract" }), + options: { + mode: "llm", + systemPrompt: buildShouldExtractSystemPrompt_F0(), + prompt: buildShouldExtractUserPrompt_F0(prompt, multiEntitySchema), + schema: { + type: "object", + properties: { + extract: { + type: "boolean", + }, + }, + required: ["extract"], + }, + }, + markdown: buildDocument(doc), + isExtractEndpoint: true, + model: getModel("gpt-4o-mini"), + }); + + return { + tokenUsage: shouldExtractCheck.totalUsage, + extract: shouldExtractCheck.extract["extract"], + }; +} diff --git a/apps/api/src/lib/extract/fire-0/completions/singleAnswer-f0.ts b/apps/api/src/lib/extract/fire-0/completions/singleAnswer-f0.ts new file mode 100644 index 00000000..4b60778e --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/completions/singleAnswer-f0.ts @@ -0,0 +1,42 @@ +import { logger } from "../../../../lib/logger"; +import { generateCompletions_F0 } from "../llmExtract-f0"; +import { buildDocument_F0 } from "../build-document-f0"; +import { Document, TokenUsage } from "../../../../controllers/v1/types"; + +export async function singleAnswerCompletion_F0({ + singleAnswerDocs, + rSchema, + links, + prompt, + systemPrompt, +}: { + singleAnswerDocs: Document[]; + rSchema: any; + links: string[]; + prompt: string; + systemPrompt: string; +}): Promise<{ + extract: any; + tokenUsage: TokenUsage; + sources: string[]; +}> { + const completion = await generateCompletions_F0({ + logger: logger.child({ module: "extract", method: "generateCompletions" }), + options: { + mode: "llm", + systemPrompt: + (systemPrompt ? `${systemPrompt}\n` : "") + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + + links.join(", "), + prompt: "Today is: " + new Date().toISOString() + "\n" + prompt, + schema: rSchema, + }, + markdown: singleAnswerDocs.map((x) => buildDocument_F0(x)).join("\n"), + isExtractEndpoint: true + }); + return { + extract: completion.extract, + tokenUsage: completion.totalUsage, + sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "") + }; +} diff --git a/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts b/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts new file mode 100644 index 00000000..b5f8f0bb --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts @@ -0,0 +1,98 @@ +import { Document, ScrapeOptions, URLTrace, scrapeOptions } from "../../../controllers/v1/types"; +import { logger } from "../../logger"; +import { getScrapeQueue } from "../../../services/queue-service"; +import { waitForJob } from "../../../services/queue-jobs"; +import { addScrapeJob } from "../../../services/queue-jobs"; +import { getJobPriority } from "../../job-priority"; +import type { Logger } from "winston"; + +interface ScrapeDocumentOptions { + url: string; + teamId: string; + origin: string; + timeout: number; + isSingleUrl?: boolean; +} + +export async function scrapeDocument_F0( + options: ScrapeDocumentOptions, + urlTraces: URLTrace[], + logger: Logger, + internalScrapeOptions: Partial = { onlyMainContent: false }, +): Promise { + const trace = urlTraces.find((t) => t.url === options.url); + if (trace) { + trace.status = "scraped"; + trace.timing.scrapedAt = new Date().toISOString(); + } + + async function attemptScrape(timeout: number) { + const jobId = crypto.randomUUID(); + const jobPriority = await getJobPriority({ + team_id: options.teamId, + basePriority: 10, + from_extract: true, + }); + + await addScrapeJob( + { + url: options.url, + mode: "single_urls", + team_id: options.teamId, + scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }), + internalOptions: { + useCache: true, + teamId: options.teamId, + }, + origin: options.origin, + is_scrape: true, + from_extract: true, + }, + {}, + jobId, + jobPriority, + ); + + const doc = await waitForJob(jobId, timeout); + await getScrapeQueue().remove(jobId); + + if (trace) { + trace.timing.completedAt = new Date().toISOString(); + trace.contentStats = { + rawContentLength: doc.markdown?.length || 0, + processedContentLength: doc.markdown?.length || 0, + tokensUsed: 0, + }; + } + + return doc; + } + + try { + try { + logger.debug("Attempting scrape..."); + const x = await attemptScrape(options.timeout); + logger.debug("Scrape finished!"); + return x; + } catch (timeoutError) { + logger.warn("Scrape failed.", { error: timeoutError }); + + if (options.isSingleUrl) { + // For single URLs, try again with double timeout + logger.debug("Attempting scrape..."); + const x = await attemptScrape(options.timeout * 2); + logger.debug("Scrape finished!"); + return x; + } + + throw timeoutError; + } + } catch (error) { + logger.error(`error in scrapeDocument`, { error }); + if (trace) { + trace.status = "error"; + trace.error = error.message; + } + return null; + } +} diff --git a/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts new file mode 100644 index 00000000..d3ab1589 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts @@ -0,0 +1,807 @@ +import { + Document, + ExtractRequest, + TokenUsage, + URLTrace, + } from "../../../controllers/v1/types"; + import { logger as _logger } from "../../logger"; + import { scrapeDocument_F0 } from "./document-scraper-f0"; + import { billTeam } from "../../../services/billing/credit_billing"; + import { logJob } from "../../../services/logging/log_job"; + import { _addScrapeJobToBullMQ } from "../../../services/queue-jobs"; + import { spreadSchemas_F0 } from "./helpers/spread-schemas-f0"; + import Ajv from "ajv"; + const ajv = new Ajv(); + + import { ExtractStep, updateExtract } from "../extract-redis"; + import { CUSTOM_U_TEAMS } from "../config"; + import { getCachedDocs, saveCachedDocs } from "../helpers/cached-docs"; + import { normalizeUrl } from "../../canonical-url"; + import { search } from "../../../search"; +import { buildRephraseToSerpPrompt_F0 } from "./build-prompts-f0"; +import { processUrl_F0, generateBasicCompletion_FO } from "./url-processor-f0"; +import { generateCompletions_F0, generateSchemaFromPrompt_F0 } from "./llmExtract-f0"; +import { dereferenceSchema_F0 } from "./helpers/dereference-schema-f0"; +import { analyzeSchemaAndPrompt_F0 } from "./completions/analyzeSchemaAndPrompt-f0"; +import { checkShouldExtract_F0 } from "./completions/checkShouldExtract-f0"; +import { batchExtractPromise_F0 } from "./completions/batchExtract-f0"; +import { transformArrayToObject_F0 } from "./helpers/transform-array-to-obj-f0"; +import { deduplicateObjectsArray_F0 } from "./helpers/deduplicate-objs-array-f0"; +import { mergeNullValObjs_F0 } from "./helpers/merge-null-val-objs-f0"; +import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0"; +import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0"; +import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0"; +import { SourceTracker_F0 } from "./helpers/source-tracker-f0"; + + + interface ExtractServiceOptions { + request: ExtractRequest; + teamId: string; + subId?: string; + cacheMode?: "load" | "save" | "direct"; + cacheKey?: string; + } + + export interface ExtractResult { + success: boolean; + data?: any; + extractId: string; + warning?: string; + urlTrace?: URLTrace[]; + error?: string; + tokenUsageBreakdown?: TokenUsage[]; + llmUsage?: number; + totalUrlsScraped?: number; + sources?: Record; + } + + type completions = { + extract: Record; + numTokens: number; + totalUsage: TokenUsage; + warning?: string; + sources?: string[]; + }; + + + export async function performExtraction_F0( + extractId: string, + options: ExtractServiceOptions, + ): Promise { + const { request, teamId, subId } = options; + const urlTraces: URLTrace[] = []; + let docsMap: Map = new Map(); + let singleAnswerCompletions: completions | null = null; + let multiEntityCompletions: completions[] = []; + let multiEntityResult: any = {}; + let singleAnswerResult: any = {}; + let totalUrlsScraped = 0; + let sources: Record = {}; + + + const logger = _logger.child({ + module: "extract", + method: "performExtraction", + extractId, + teamId, + }); + + // If no URLs are provided, generate URLs from the prompt + if ((!request.urls || request.urls.length === 0) && request.prompt) { + logger.debug("Generating URLs from prompt...", { + prompt: request.prompt, + }); + const rephrasedPrompt = await generateBasicCompletion_FO(buildRephraseToSerpPrompt_F0(request.prompt)); + const searchResults = await search({ + query: rephrasedPrompt.replace('"', "").replace("'", ""), + num_results: 10, + }); + + request.urls = searchResults.map(result => result.url) as string[]; + } + if (request.urls && request.urls.length === 0) { + logger.error("No search results found", { + query: request.prompt, + }); + return { + success: false, + error: "No search results found", + extractId, + }; + } + + const urls = request.urls || ([] as string[]); + + if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) { + logger.debug("Loading cached docs..."); + try { + const cache = await getCachedDocs(urls, request.__experimental_cacheKey); + for (const doc of cache) { + if (doc.metadata.url) { + docsMap.set(normalizeUrl(doc.metadata.url), doc); + } + } + } catch (error) { + logger.error("Error loading cached docs", { error }); + } + } + + // Token tracking + let tokenUsage: TokenUsage[] = []; + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.INITIAL, + startedAt: Date.now(), + finishedAt: Date.now(), + discoveredLinks: request.urls, + }, + ], + }); + + let startMap = Date.now(); + let aggMapLinks: string[] = []; + logger.debug("Processing URLs...", { + urlCount: request.urls?.length || 0, + }); + + const urlPromises = urls.map((url) => + processUrl_F0( + { + url, + prompt: request.prompt, + teamId, + allowExternalLinks: request.allowExternalLinks, + origin: request.origin, + limit: request.limit, + includeSubdomains: request.includeSubdomains, + schema: request.schema, + }, + urlTraces, + (links: string[]) => { + aggMapLinks.push(...links); + updateExtract(extractId, { + steps: [ + { + step: ExtractStep.MAP, + startedAt: startMap, + finishedAt: Date.now(), + discoveredLinks: aggMapLinks, + }, + ], + }); + }, + logger.child({ module: "extract", method: "processUrl", url }), + ), + ); + + const processedUrls = await Promise.all(urlPromises); + const links = processedUrls.flat().filter((url) => url); + logger.debug("Processed URLs.", { + linkCount: links.length, + }); + + if (links.length === 0) { + logger.error("0 links! Bailing.", { + linkCount: links.length, + }); + return { + success: false, + error: + "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.", + extractId, + urlTrace: urlTraces, + totalUrlsScraped: 0, + }; + } + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MAP_RERANK, + startedAt: startMap, + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + + let reqSchema = request.schema; + if (!reqSchema && request.prompt) { + reqSchema = await generateSchemaFromPrompt_F0(request.prompt); + logger.debug("Generated request schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + } + + if (reqSchema) { + reqSchema = await dereferenceSchema_F0(reqSchema); + } + + logger.debug("Transformed schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + + // agent evaluates if the schema or the prompt has an array with big amount of items + // also it checks if the schema any other properties that are not arrays + // if so, it splits the results into 2 types of completions: + // 1. the first one is a completion that will extract the array of items + // 2. the second one is multiple completions that will extract the items from the array + let startAnalyze = Date.now(); + const { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + tokenUsage: schemaAnalysisTokenUsage, + } = await analyzeSchemaAndPrompt_F0(links, reqSchema, request.prompt ?? ""); + + logger.debug("Analyzed schema.", { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + }); + + // Track schema analysis tokens + tokenUsage.push(schemaAnalysisTokenUsage); + + // console.log("\nIs Multi Entity:", isMultiEntity); + // console.log("\nMulti Entity Keys:", multiEntityKeys); + // console.log("\nReasoning:", reasoning); + // console.log("\nKey Indicators:", keyIndicators); + + let rSchema = reqSchema; + if (isMultiEntity && reqSchema) { + logger.debug("=== MULTI-ENTITY ==="); + + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas_F0( + reqSchema, + multiEntityKeys, + ); + rSchema = singleAnswerSchema; + logger.debug("Spread schemas.", { singleAnswerSchema, multiEntitySchema }); + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY, + startedAt: startAnalyze, + finishedAt: Date.now(), + discoveredLinks: [], + }, + ], + }); + + const timeout = 60000; + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_SCRAPE, + startedAt: startAnalyze, + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + + logger.debug("Starting multi-entity scrape..."); + let startScrape = Date.now(); + + const scrapePromises = links.map((url) => { + if (!docsMap.has(normalizeUrl(url))) { + return scrapeDocument_F0( + { + url, + teamId, + origin: request.origin || "api", + timeout, + }, + urlTraces, + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: true, + }), + { + ...request.scrapeOptions, + + // Needs to be true for multi-entity to work properly + onlyMainContent: true, + } + ); + } + return docsMap.get(normalizeUrl(url)); + }); + + let multyEntityDocs = (await Promise.all(scrapePromises)).filter( + (doc): doc is Document => doc !== null, + ); + + logger.debug("Multi-entity scrape finished.", { + docCount: multyEntityDocs.length, + }); + + totalUrlsScraped += multyEntityDocs.length; + + let endScrape = Date.now(); + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_SCRAPE, + startedAt: startScrape, + finishedAt: endScrape, + discoveredLinks: links, + }, + ], + }); + + for (const doc of multyEntityDocs) { + if (doc?.metadata?.url) { + docsMap.set(normalizeUrl(doc.metadata.url), doc); + } + } + + logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing + + // Process docs in chunks with queue style processing + const chunkSize = 50; + const timeoutCompletion = 45000; // 45 second timeout + const chunks: Document[][] = []; + const extractionResults: {extract: any, url: string}[] = []; + + // Split into chunks + for (let i = 0; i < multyEntityDocs.length; i += chunkSize) { + chunks.push(multyEntityDocs.slice(i, i + chunkSize)); + } + + // Process chunks sequentially with timeout + for (const chunk of chunks) { + const chunkPromises = chunk.map(async (doc) => { + try { + ajv.compile(multiEntitySchema); + + // Wrap in timeout promise + const timeoutPromise = new Promise((resolve) => { + setTimeout(() => resolve(null), timeoutCompletion); + }); + + // Check if page should be extracted before proceeding + const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract_F0( + request.prompt ?? "", + multiEntitySchema, + doc, + ); + + tokenUsage.push(shouldExtractCheckTokenUsage); + + if (!extract) { + logger.info( + `Skipping extraction for ${doc.metadata.url} as content is irrelevant`, + ); + return null; + } + // Add confidence score to schema with 5 levels + const schemaWithConfidence = { + ...multiEntitySchema, + properties: { + ...multiEntitySchema.properties, + is_content_relevant: { + type: "boolean", + description: + "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.", + }, + }, + required: [ + ...(multiEntitySchema.required || []), + "is_content_relevant", + ], + }; + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_EXTRACT, + startedAt: startScrape, + finishedAt: Date.now(), + discoveredLinks: [ + doc.metadata.url || doc.metadata.sourceURL || "", + ], + }, + ], + }); + + const completionPromise = batchExtractPromise_F0(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc); + + // Race between timeout and completion + const multiEntityCompletion = (await Promise.race([ + completionPromise, + timeoutPromise, + ])) as Awaited>; + + // Track multi-entity extraction tokens + if (multiEntityCompletion) { + tokenUsage.push(multiEntityCompletion.totalUsage); + + if (multiEntityCompletion.extract) { + return { + extract: multiEntityCompletion.extract, + url: doc.metadata.url || doc.metadata.sourceURL || "" + }; + } + } + + // console.log(multiEntityCompletion.extract) + // if (!multiEntityCompletion.extract?.is_content_relevant) { + // console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`); + // return null; + // } + + // Update token usage in traces + // if (multiEntityCompletion && multiEntityCompletion.numTokens) { + // const totalLength = docs.reduce( + // (sum, doc) => sum + (doc.markdown?.length || 0), + // 0, + // ); + // docs.forEach((doc) => { + // if (doc.metadata?.sourceURL) { + // const trace = urlTraces.find( + // (t) => t.url === doc.metadata.sourceURL, + // ); + // if (trace && trace.contentStats) { + // trace.contentStats.tokensUsed = Math.floor( + // ((doc.markdown?.length || 0) / totalLength) * + // (multiEntityCompletion?.numTokens || 0), + // ); + // } + // } + // }); + // } + + // if (multiEntityCompletion.extract && multiEntityCompletion.extract.extraction_confidence < 3) { + // console.log(`Skipping extraction for ${doc.metadata.url} as confidence is too low (${multiEntityCompletion.extract.extraction_confidence})`); + // return null; + // } + + return null; + } catch (error) { + logger.error(`Failed to process document.`, { + error, + url: doc.metadata.url ?? doc.metadata.sourceURL!, + }); + return null; + } + }); + + // Wait for current chunk to complete before processing next chunk + const chunkResults = await Promise.all(chunkPromises); + const validResults = chunkResults.filter((result): result is {extract: any, url: string} => result !== null); + extractionResults.push(...validResults); + multiEntityCompletions.push(...validResults.map(r => r.extract)); + logger.debug("All multi-entity completion chunks finished.", { + completionCount: multiEntityCompletions.length, + }); + } + + try { + // Use SourceTracker to handle source tracking + const sourceTracker = new SourceTracker_F0(); + + // Transform and merge results while preserving sources + sourceTracker.transformResults_F0(extractionResults, multiEntitySchema, false); + + multiEntityResult = transformArrayToObject_F0( + multiEntitySchema, + multiEntityCompletions, + ); + + // Track sources before deduplication + sourceTracker.trackPreDeduplicationSources_F0(multiEntityResult); + + // Apply deduplication and merge + multiEntityResult = deduplicateObjectsArray_F0(multiEntityResult); + multiEntityResult = mergeNullValObjs_F0(multiEntityResult); + + // Map sources to final deduplicated/merged items + const multiEntitySources = sourceTracker.mapSourcesToFinalItems_F0(multiEntityResult, multiEntityKeys); + Object.assign(sources, multiEntitySources); + + } catch (error) { + logger.error(`Failed to transform array to object`, { error }); + return { + success: false, + error: + "An unexpected error occurred. Please contact help@firecrawl.com for help.", + extractId, + urlTrace: urlTraces, + totalUrlsScraped, + }; + } + } + if ( + rSchema && + Object.keys(rSchema).length > 0 && + rSchema.properties && + Object.keys(rSchema.properties).length > 0 + ) { + logger.debug("=== SINGLE PAGES ===", { + linkCount: links.length, + schema: rSchema, + }); + + // Scrape documents + const timeout = 60000; + let singleAnswerDocs: Document[] = []; + + // let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces); + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.SCRAPE, + startedAt: Date.now(), + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + const scrapePromises = links.map((url) => { + if (!docsMap.has(normalizeUrl(url))) { + return scrapeDocument_F0( + { + url, + teamId, + origin: request.origin || "api", + timeout, + }, + urlTraces, + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: false, + }), + request.scrapeOptions + ); + } + return docsMap.get(normalizeUrl(url)); + }); + + try { + const results = await Promise.all(scrapePromises); + + for (const doc of results) { + if (doc?.metadata?.url) { + docsMap.set(normalizeUrl(doc.metadata.url), doc); + } + } + logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing + + const validResults = results.filter( + (doc): doc is Document => doc !== null, + ); + singleAnswerDocs.push(...validResults); + totalUrlsScraped += validResults.length; + + logger.debug("Scrapes finished.", { docCount: validResults.length }); + } catch (error) { + return { + success: false, + error: error.message, + extractId, + urlTrace: urlTraces, + totalUrlsScraped, + }; + } + + if (docsMap.size == 0) { + // All urls are invalid + logger.error("All provided URLs are invalid!"); + return { + success: false, + error: + "All provided URLs are invalid. Please check your input and try again.", + extractId, + urlTrace: request.urlTrace ? urlTraces : undefined, + totalUrlsScraped: 0, + }; + } + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.EXTRACT, + startedAt: Date.now(), + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + + // Generate completions + logger.debug("Generating singleAnswer completions..."); + let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion_F0({ + singleAnswerDocs, + rSchema, + links, + prompt: request.prompt ?? "", + systemPrompt: request.systemPrompt ?? "" + }); + logger.debug("Done generating singleAnswer completions."); + + // Track single answer extraction tokens and sources + if (completionResult) { + tokenUsage.push(singleAnswerTokenUsage); + + // Add sources for top-level properties in single answer + if (rSchema?.properties) { + Object.keys(rSchema.properties).forEach(key => { + if (completionResult[key] !== undefined) { + sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""); + } + }); + } + } + + singleAnswerResult = completionResult; + singleAnswerCompletions = singleAnswerResult; + + // Update token usage in traces + // if (completions && completions.numTokens) { + // const totalLength = docs.reduce( + // (sum, doc) => sum + (doc.markdown?.length || 0), + // 0, + // ); + // docs.forEach((doc) => { + // if (doc.metadata?.sourceURL) { + // const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL); + // if (trace && trace.contentStats) { + // trace.contentStats.tokensUsed = Math.floor( + // ((doc.markdown?.length || 0) / totalLength) * + // (completions?.numTokens || 0), + // ); + // } + // } + // }); + // } + } + + let finalResult = reqSchema + ? await mixSchemaObjects_F0( + reqSchema, + singleAnswerResult, + multiEntityResult, + logger.child({ method: "mixSchemaObjects" }), + ) + : singleAnswerResult || multiEntityResult; + + // Tokenize final result to get token count + // let finalResultTokens = 0; + // if (finalResult) { + // const finalResultStr = JSON.stringify(finalResult); + // finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o"); + + // } + // // Deduplicate and validate final result against schema + // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) { + // const schemaValidation = await generateCompletions( + // logger.child({ method: "extractService/validateAndDeduplicate" }), + // { + // mode: "llm", + // systemPrompt: `You are a data validator and deduplicator. Your task is to: + // 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema + // 2. Ensure all data matches the provided schema + // 3. Keep only the highest quality and most complete entries when duplicates are found. + + // Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`, + // prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n + + // + + // ${JSON.stringify(finalResult)} + + // + + // + + // ${JSON.stringify(reqSchema)} + + // + // `, + // schema: reqSchema, + // }, + // undefined, + // undefined, + // true, + // "gpt-4o" + // ); + // console.log("schemaValidation", schemaValidation); + + // console.log("schemaValidation", finalResult); + + // if (schemaValidation?.extract) { + // tokenUsage.push(schemaValidation.totalUsage); + // finalResult = schemaValidation.extract; + // } + // } + + const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0); + const llmUsage = estimateTotalCost_F0(tokenUsage); + let tokensToBill = calculateFinalResultCost_F0(finalResult); + + if (CUSTOM_U_TEAMS.includes(teamId)) { + tokensToBill = 1; + } + + // Bill team for usage + billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => { + logger.error( + `Failed to bill team ${teamId} for ${tokensToBill} tokens: ${error}`, + ); + }); + + // Log job with token usage and sources + logJob({ + job_id: extractId, + success: true, + message: "Extract completed", + num_docs: 1, + docs: finalResult ?? {}, + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: totalTokensUsed, + tokens_billed: tokensToBill, + sources, + }).then(() => { + updateExtract(extractId, { + status: "completed", + llmUsage, + sources, + }).catch((error) => { + logger.error( + `Failed to update extract ${extractId} status to completed: ${error}`, + ); + }); + }); + + logger.debug("Done!"); + + if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) { + logger.debug("Saving cached docs..."); + try { + await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey); + } catch (error) { + logger.error("Error saving cached docs", { error }); + } + } + + return { + success: true, + data: finalResult ?? {}, + extractId, + warning: undefined, + urlTrace: request.urlTrace ? urlTraces : undefined, + llmUsage, + totalUrlsScraped, + sources, + }; + } + \ No newline at end of file diff --git a/apps/api/src/lib/extract/fire-0/helpers/deduplicate-objs-array-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/deduplicate-objs-array-f0.ts new file mode 100644 index 00000000..fa3c46f7 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/deduplicate-objs-array-f0.ts @@ -0,0 +1,29 @@ +export function deduplicateObjectsArray_F0(objArray: { [key: string]: any[] }): { + [key: string]: any[]; +} { + const deduplicatedObjArray: { [key: string]: any[] } = {}; + + for (const key in objArray) { + if (Array.isArray(objArray[key])) { + const seen = new Set(); + deduplicatedObjArray[key] = objArray[key].filter((item) => { + // Create a unique identifier for each item based on its properties + const identifier = JSON.stringify(item); + + // Check if this identifier has been seen before + if (seen.has(identifier)) { + return false; // Duplicate found, filter it out + } + + // Add the identifier to the set and keep the item + seen.add(identifier); + return true; + }); + } else { + // If the value is not an array, just copy it as is + deduplicatedObjArray[key] = objArray[key]; + } + } + + return deduplicatedObjArray; +} diff --git a/apps/api/src/lib/extract/fire-0/helpers/dereference-schema-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/dereference-schema-f0.ts new file mode 100644 index 00000000..d4f863a7 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/dereference-schema-f0.ts @@ -0,0 +1,10 @@ +import { dereference } from "@apidevtools/json-schema-ref-parser"; + +export async function dereferenceSchema_F0(schema: any): Promise { + try { + return await dereference(schema); + } catch (error) { + console.error("Failed to dereference schema:", error); + throw error; + } +} diff --git a/apps/api/src/lib/extract/fire-0/helpers/merge-null-val-objs-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/merge-null-val-objs-f0.ts new file mode 100644 index 00000000..ec798c8a --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/merge-null-val-objs-f0.ts @@ -0,0 +1,153 @@ +import { deduplicateObjectsArray_F0 } from "./deduplicate-objs-array-f0"; + +/** + * Convert "null" strings to actual null values for easier comparison. + */ +function unifyValue(val: any): any { + return val === "null" ? null : val; +} + +/** + * Convert all "null" strings in an object to actual null values. + */ +function unifyItemValues(item: T): T { + const unifiedItem: any = {}; + for (const key of Object.keys(item)) { + unifiedItem[key] = unifyValue(item[key]); + } + return unifiedItem; +} + +/** + * Check if two objects are mergeable by comparing their non-null values + */ +export function areMergeable_F0(obj1: any, obj2: any): boolean { + const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]); + let matchingNonNullValues = 0; + let nonNullComparisons = 0; + + for (const key of allKeys) { + const val1 = obj1[key]; + const val2 = obj2[key]; + + // Skip array comparisons - they'll be merged separately + if (Array.isArray(val1) || Array.isArray(val2)) { + continue; + } + + // If both values exist and are not null + if (val1 !== null && val2 !== null) { + nonNullComparisons++; + if (val1 === val2) { + matchingNonNullValues++; + } + } + } + + // Objects are mergeable if they have at least one matching non-null value + // and all their non-null values match when both objects have them + return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons; +} + +/** + * Merge arrays and remove duplicates + */ +function mergeArrays(arr1: any[], arr2: any[]): any[] { + const combined = [...arr1, ...arr2]; + return combined.filter((item, index) => { + const stringified = JSON.stringify(item); + return ( + combined.findIndex((other) => JSON.stringify(other) === stringified) === + index + ); + }); +} + +/** + * Merge two objects, taking non-null values over null values + */ +function mergeObjects(obj1: any, obj2: any): any { + const result = { ...obj1 }; + + for (const key in obj2) { + if (obj2.hasOwnProperty(key)) { + // If obj2's value is non-null, it should override obj1's value + if (obj2[key] !== null) { + if (Array.isArray(obj2[key])) { + // If both are arrays, merge them + if (Array.isArray(result[key])) { + result[key] = mergeArrays(result[key], obj2[key]); + } else { + // If only obj2's value is an array, use it + result[key] = [...obj2[key]]; + } + } else if (typeof obj2[key] === "object") { + // If both are objects (but not arrays), merge them + if (typeof result[key] === "object" && !Array.isArray(result[key])) { + result[key] = mergeObjects(result[key], obj2[key]); + } else { + result[key] = { ...obj2[key] }; + } + } else { + // For primitive values, obj2's non-null value always wins + result[key] = obj2[key]; + } + } + } + } + + return result; +} + +/** + * Merges arrays of objects by combining those that are identical except for + * null-equivalent fields, filling in null fields with the corresponding + * non-null fields from the other object. + */ +export function mergeNullValObjs_F0(objArray: { [key: string]: any[] }): { + [key: string]: any[]; +} { + const result: { [key: string]: any[] } = {}; + + for (const key in objArray) { + if (Array.isArray(objArray[key])) { + // If array contains only primitive values, return as is + if ( + objArray[key].every((item) => typeof item !== "object" || item === null) + ) { + result[key] = [...objArray[key]]; + continue; + } + + const items = objArray[key].map(unifyItemValues); + const mergedItems: any[] = []; + + for (const item of items) { + let merged = false; + + for (let i = 0; i < mergedItems.length; i++) { + if (areMergeable_F0(mergedItems[i], item)) { + mergedItems[i] = mergeObjects(mergedItems[i], item); + merged = true; + break; + } + } + + if (!merged) { + mergedItems.push({ ...item }); + } + } + + // Final deduplication pass + result[key] = deduplicateObjectsArray_F0({ [key]: mergedItems })[key]; + } else { + console.warn( + `Expected an array at objArray[${key}], but found:`, + objArray[key], + ); + return objArray; + } + } + + return result; +} diff --git a/apps/api/src/lib/extract/fire-0/helpers/mix-schema-objs-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/mix-schema-objs-f0.ts new file mode 100644 index 00000000..8f7e13c2 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/mix-schema-objs-f0.ts @@ -0,0 +1,48 @@ +import type { Logger } from "winston"; + +export async function mixSchemaObjects_F0( + finalSchema: any, + singleAnswerResult: any, + multiEntityResult: any, + logger?: Logger +) { + const finalResult: any = {}; + logger?.debug("Mixing schema objects."); + + // Recursive helper function to merge results based on schema + function mergeResults(schema: any, singleResult: any, multiResult: any) { + const result: any = {}; + for (const key in schema.properties) { + if ( + schema.properties[key].type === "object" && + schema.properties[key].properties + ) { + // If the property is an object, recursively merge its properties + result[key] = mergeResults( + schema.properties[key], + singleResult[key] || {}, + multiResult[key] || {}, + ); + } else if ( + schema.properties[key].type === "array" && + Array.isArray(multiResult[key]) + ) { + // If the property is an array, flatten the arrays from multiResult + result[key] = multiResult[key].flat(); + } else if (singleResult.hasOwnProperty(key)) { + result[key] = singleResult[key]; + } else if (multiResult.hasOwnProperty(key)) { + result[key] = multiResult[key]; + } + } + return result; + } + + // Merge the properties from the final schema + Object.assign( + finalResult, + mergeResults(finalSchema, singleAnswerResult, multiEntityResult), + ); + + return finalResult; +} diff --git a/apps/api/src/lib/extract/fire-0/helpers/source-tracker-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/source-tracker-f0.ts new file mode 100644 index 00000000..72c70c97 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/source-tracker-f0.ts @@ -0,0 +1,151 @@ +import { logger } from "../../../../lib/logger"; +import { areMergeable_F0 } from "./merge-null-val-objs-f0"; +import { transformArrayToObject_F0 } from "./transform-array-to-obj-f0"; + +interface TransformedResult { + transformed: { [key: string]: any[] } | any[]; + url: string; +} + +/** + * Tracks sources through the transformation, deduplication, and merging process + */ +export class SourceTracker_F0 { + private transformedResults: TransformedResult[]; + private preDedupeSourceMap: Map; + + constructor() { + this.transformedResults = []; + this.preDedupeSourceMap = new Map(); + } + + /** + * Transform raw extraction results into a format that preserves source information + */ + transformResults_F0(extractionResults: { extract: any; url: string }[], schema: any, withTransform: boolean = true) { + // Handle array outputs + if (Array.isArray(extractionResults[0]?.extract)) { + this.transformedResults = extractionResults.map(result => ({ + transformed: result.extract, + url: result.url + })); + + if (withTransform) { + // Combine all extracts to match original behavior + const combinedExtracts = extractionResults.map(r => r.extract).flat(); + return combinedExtracts; + } + return this.transformedResults; + } + + // Handle object outputs (original behavior) + this.transformedResults = extractionResults.map(result => ({ + transformed: transformArrayToObject_F0(schema, [result.extract]), + url: result.url + })); + + if (withTransform) { + // Then combine all extracts and transform them together to match original behavior + const combinedExtracts = extractionResults.map(r => r.extract); + return transformArrayToObject_F0(schema, combinedExtracts); + } + return this.transformedResults; + } + + /** + * Track sources for each item before deduplication + */ + trackPreDeduplicationSources_F0(multiEntityResult: { [key: string]: any[] } | any[]) { + try { + if (Array.isArray(multiEntityResult)) { + // Handle array outputs + multiEntityResult.forEach((item: any) => { + const itemKey = JSON.stringify(item); + const matchingSources = this.transformedResults + .filter(result => + Array.isArray(result.transformed) && + result.transformed.some((resultItem: any) => + JSON.stringify(resultItem) === itemKey + ) + ) + .map(result => result.url); + this.preDedupeSourceMap.set(itemKey, matchingSources); + }); + } else { + // Handle object outputs (original behavior) + Object.keys(multiEntityResult).forEach(key => { + multiEntityResult[key].forEach((item: any) => { + const itemKey = JSON.stringify(item); + const matchingSources = this.transformedResults + .filter(result => + result.transformed[key]?.some((resultItem: any) => + JSON.stringify(resultItem) === itemKey + ) + ) + .map(result => result.url); + this.preDedupeSourceMap.set(itemKey, matchingSources); + }); + }); + } + } catch (error) { + logger.error(`Failed to track pre-deduplication sources`, { error }); + } + } + + /** + * Map sources to final deduplicated/merged items + */ + mapSourcesToFinalItems_F0( + multiEntityResult: { [key: string]: any[] } | any[], + multiEntityKeys: string[] + ): Record { + try { + const sources: Record = {}; + + if (Array.isArray(multiEntityResult)) { + // Handle array outputs + multiEntityResult.forEach((item: any, finalIndex: number) => { + const sourceKey = `[${finalIndex}]`; + const itemSources = new Set(); + + this.transformedResults.forEach(result => { + if (Array.isArray(result.transformed)) { + result.transformed.forEach((originalItem: any) => { + if (areMergeable_F0(item, originalItem)) { + itemSources.add(result.url); + } + }); + } + }); + + sources[sourceKey] = Array.from(itemSources); + }); + } else { + // Handle object outputs (original behavior) + multiEntityKeys.forEach(key => { + if (multiEntityResult[key] && Array.isArray(multiEntityResult[key])) { + multiEntityResult[key].forEach((item: any, finalIndex: number) => { + const sourceKey = `${key}[${finalIndex}]`; + const itemSources = new Set(); + + this.transformedResults.forEach(result => { + result.transformed[key]?.forEach((originalItem: any) => { + if (areMergeable_F0(item, originalItem)) { + itemSources.add(result.url); + } + }); + }); + + sources[sourceKey] = Array.from(itemSources); + }); + } + }); + } + + return sources; + } catch (error) { + logger.error(`Failed to map sources to final items`, { error }); + return {}; + } + } +} \ No newline at end of file diff --git a/apps/api/src/lib/extract/fire-0/helpers/spread-schemas-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/spread-schemas-f0.ts new file mode 100644 index 00000000..5aa7fbfc --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/spread-schemas-f0.ts @@ -0,0 +1,82 @@ +export async function spreadSchemas_F0( + schema: any, + keys: string[], +): Promise<{ + singleAnswerSchema: any; + multiEntitySchema: any; +}> { + let singleAnswerSchema = { ...schema, properties: { ...schema.properties } }; + let multiEntitySchema: any = { + type: "object", + properties: {}, + ...(schema.required ? { required: [] } : {}) + }; + + // Helper function to check if a property path exists in schema + const hasPropertyPath = (schema: any, path: string[]): boolean => { + let current = schema.properties; + for (let i = 0; i < path.length; i++) { + if (!current[path[i]]) return false; + if (current[path[i]].type === "array" && current[path[i]].items) { + current = current[path[i]].items.properties; + } else { + current = current[path[i]].properties; + } + } + return true; + }; + + // Helper function to get the root property of a dot path + const getRootProperty = (path: string): string => { + return path.split('.')[0]; + }; + + keys.forEach((key) => { + const rootProperty = getRootProperty(key); + if (singleAnswerSchema.properties[rootProperty]) { + multiEntitySchema.properties[rootProperty] = singleAnswerSchema.properties[rootProperty]; + delete singleAnswerSchema.properties[rootProperty]; + + // Move required field if it exists + if (schema.required?.includes(rootProperty)) { + multiEntitySchema.required.push(rootProperty); + singleAnswerSchema.required = schema.required.filter((k: string) => k !== rootProperty); + } + } + }); + + // Recursively delete empty properties in singleAnswerSchema + const deleteEmptyProperties = (schema: any) => { + for (const key in schema.properties) { + if ( + schema.properties[key].properties && + Object.keys(schema.properties[key].properties).length === 0 + ) { + delete schema.properties[key]; + } else if (schema.properties[key].properties) { + deleteEmptyProperties(schema.properties[key]); + } + } + }; + + deleteEmptyProperties(singleAnswerSchema); + deleteEmptyProperties(multiEntitySchema); + + // If singleAnswerSchema has no properties left, return an empty object + if (Object.keys(singleAnswerSchema.properties).length === 0) { + singleAnswerSchema = {}; + } else if (singleAnswerSchema.required?.length === 0) { + delete singleAnswerSchema.required; + } + + if (Object.keys(multiEntitySchema.properties).length === 0) { + multiEntitySchema = {}; + } else if (multiEntitySchema.required?.length === 0) { + delete multiEntitySchema.required; + } + + return { + singleAnswerSchema, + multiEntitySchema, + }; +} diff --git a/apps/api/src/lib/extract/fire-0/helpers/transform-array-to-obj-f0.ts b/apps/api/src/lib/extract/fire-0/helpers/transform-array-to-obj-f0.ts new file mode 100644 index 00000000..07237bd0 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/helpers/transform-array-to-obj-f0.ts @@ -0,0 +1,167 @@ +import isEqual from "lodash/isEqual"; + +/** + * Transforms an array of objects into a single object, merging properties with the same name. + * @param originalSchema - The schema of the original data. + * @param arrayData - The array of objects to transform. + * @returns A single object with merged properties. + */ +export function transformArrayToObject_F0( + originalSchema: any, + arrayData: any[], +): any { + if (Object.keys(originalSchema).length == 0) { + return {}; + } + + const transformedResult: any = {}; + + // Function to find the array key in a nested schema + function findArrayKey(schema: any): string | null { + for (const key in schema.properties) { + if (schema.properties[key].type === "array") { + return key; + } else if (schema.properties[key].type === "object") { + const nestedKey = findArrayKey(schema.properties[key]); + if (nestedKey) { + return `${key}.${nestedKey}`; + } + } + } + return null; + } + + const arrayKeyPath = findArrayKey(originalSchema); + if (!arrayKeyPath) { + return arrayData.reduce((acc, item) => { + for (const key in item) { + if (!acc[key]) { + acc[key] = item[key]; + } else if ( + typeof acc[key] === "object" && + typeof item[key] === "object" + ) { + acc[key] = { ...acc[key], ...item[key] }; + } + } + return acc; + }, {}); + } + + const arrayKeyParts = arrayKeyPath.split("."); + const arrayKey = arrayKeyParts.pop(); + if (!arrayKey) { + throw new Error("Array key not found in schema"); + } + + const parentSchema = arrayKeyParts.reduce( + (schema, key) => schema.properties[key], + originalSchema, + ); + const itemSchema = parentSchema.properties[arrayKey].items; + if (!itemSchema) { + throw new Error("Item schema not found for array key"); + } + + // Initialize the array in the transformed result + let currentLevel = transformedResult; + arrayKeyParts.forEach((part) => { + if (!currentLevel[part]) { + currentLevel[part] = {}; + } + currentLevel = currentLevel[part]; + }); + currentLevel[arrayKey] = []; + + // Helper function to check if an object is already in the array + function isDuplicateObject(array: any[], obj: any): boolean { + return array.some((existingItem) => isEqual(existingItem, obj)); + } + + // Helper function to validate if an object follows the schema + function isValidObject(obj: any, schema: any): boolean { + return Object.keys(schema.properties).every((key) => { + return ( + obj.hasOwnProperty(key) && + typeof obj[key] === schema.properties[key].type + ); + }); + } + + // Iterate over each item in the arrayData + arrayData.forEach((item) => { + let currentItem = item; + arrayKeyParts.forEach((part) => { + if (currentItem[part]) { + currentItem = currentItem[part]; + } + }); + + // Copy non-array properties from the parent object + for (const key in parentSchema.properties) { + if ( + key !== arrayKey && + currentItem.hasOwnProperty(key) && + !currentLevel.hasOwnProperty(key) + ) { + currentLevel[key] = currentItem[key]; + } + } + + // Ensure that the currentItem[arrayKey] is an array before mapping + if (Array.isArray(currentItem[arrayKey])) { + currentItem[arrayKey].forEach((subItem: any) => { + if ( + typeof subItem === "object" && + subItem !== null && + isValidObject(subItem, itemSchema) + ) { + // For arrays of objects, add only unique objects + const transformedItem: any = {}; + let hasValidData = false; + + for (const key in itemSchema.properties) { + if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) { + transformedItem[key] = subItem[key]; + hasValidData = true; + } + } + + if ( + hasValidData && + !isDuplicateObject(currentLevel[arrayKey], transformedItem) + ) { + currentLevel[arrayKey].push(transformedItem); + } + } + }); + } else { + console.warn( + `Expected an array at ${arrayKey}, but found:`, + currentItem[arrayKey], + ); + } + + // Handle merging of array properties + for (const key in parentSchema.properties) { + if ( + parentSchema.properties[key].type === "array" && + Array.isArray(currentItem[key]) + ) { + if (!currentLevel[key]) { + currentLevel[key] = []; + } + currentItem[key].forEach((value: any) => { + if ( + !currentLevel[key].includes(value) && + !isDuplicateObject(currentLevel[arrayKey], value) + ) { + currentLevel[key].push(value); + } + }); + } + } + }); + + return transformedResult; +} diff --git a/apps/api/src/lib/extract/fire-0/llmExtract-f0.ts b/apps/api/src/lib/extract/fire-0/llmExtract-f0.ts new file mode 100644 index 00000000..efe82d4b --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/llmExtract-f0.ts @@ -0,0 +1,469 @@ +import { encoding_for_model } from "@dqbd/tiktoken"; +import { TiktokenModel } from "@dqbd/tiktoken"; +import { + Document, + ExtractOptions, + TokenUsage, +} from "../../../controllers/v1/types"; +import { Logger } from "winston"; +import { logger } from "../../../lib/logger"; +import { modelPrices } from "../../../lib/extract/usage/model-prices"; +import { generateObject, generateText, LanguageModel } from 'ai'; +import { jsonSchema } from 'ai'; +import { getModel } from "../../../lib/generic-ai"; +import { z } from "zod"; +import { EngineResultsTracker, Meta } from "../../../scraper/scrapeURL"; + +// Get max tokens from model prices +const getModelLimits_F0 = (model: string) => { + const modelConfig = modelPrices[model]; + if (!modelConfig) { + // Default fallback values + return { + maxInputTokens: 8192, + maxOutputTokens: 4096, + maxTokens: 12288, + }; + } + return { + maxInputTokens: modelConfig.max_input_tokens || modelConfig.max_tokens, + maxOutputTokens: modelConfig.max_output_tokens || modelConfig.max_tokens, + maxTokens: modelConfig.max_tokens, + }; +}; + +export class LLMRefusalError extends Error { + public refusal: string; + public results: EngineResultsTracker | undefined; + + constructor(refusal: string) { + super("LLM refused to extract the website's content"); + this.refusal = refusal; + } +} + +function normalizeSchema(x: any): any { + if (typeof x !== "object" || x === null) return x; + + if (x["$defs"] !== null && typeof x["$defs"] === "object") { + x["$defs"] = Object.fromEntries( + Object.entries(x["$defs"]).map(([name, schema]) => [ + name, + normalizeSchema(schema), + ]), + ); + } + + if (x && x.anyOf) { + x.anyOf = x.anyOf.map((x) => normalizeSchema(x)); + } + + if (x && x.oneOf) { + x.oneOf = x.oneOf.map((x) => normalizeSchema(x)); + } + + if (x && x.allOf) { + x.allOf = x.allOf.map((x) => normalizeSchema(x)); + } + + if (x && x.not) { + x.not = normalizeSchema(x.not); + } + + if (x && x.type === "object") { + return { + ...x, + properties: Object.fromEntries( + Object.entries(x.properties || {}).map(([k, v]) => [k, normalizeSchema(v)]), + ), + required: Object.keys(x.properties || {}), + additionalProperties: false, + }; + } else if (x && x.type === "array") { + return { + ...x, + items: normalizeSchema(x.items), + }; + } else { + return x; + } +} + + + +interface TrimResult { + text: string; + numTokens: number; + warning?: string; +} + +export function trimToTokenLimit_F0(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult { + try { + const encoder = encoding_for_model(modelId as TiktokenModel); + try { + const tokens = encoder.encode(text); + const numTokens = tokens.length; + + if (numTokens <= maxTokens) { + return { text, numTokens }; + } + + const modifier = 3; + // Start with 3 chars per token estimation + let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1); + + // Keep trimming until we're under the token limit + while (true) { + const currentTokens = encoder.encode(currentText); + if (currentTokens.length <= maxTokens) { + const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`; + return { + text: currentText, + numTokens: currentTokens.length, + warning: previousWarning ? `${warning} ${previousWarning}` : warning + }; + } + const overflow = currentTokens.length * modifier - maxTokens - 1; + // If still over limit, remove another chunk + currentText = currentText.slice(0, Math.floor(currentText.length - overflow)); + } + + } catch (e) { + throw e; + } finally { + encoder.free(); + } + } catch (error) { + // Fallback to a more conservative character-based approach + const estimatedCharsPerToken = 2.8; + const safeLength = maxTokens * estimatedCharsPerToken; + const trimmedText = text.slice(0, Math.floor(safeLength)); + + const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`; + + return { + text: trimmedText, + numTokens: maxTokens, // We assume we hit the max in this fallback case + warning: previousWarning ? `${warning} ${previousWarning}` : warning + }; + } +} + +export async function generateCompletions_F0({ + logger, + options, + markdown, + previousWarning, + isExtractEndpoint, + model = getModel("gpt-4o-mini"), + mode = "object", +}: { + model?: LanguageModel; + logger: Logger; + options: ExtractOptions; + markdown?: string; + previousWarning?: string; + isExtractEndpoint?: boolean; + mode?: "object" | "no-object"; +}): Promise<{ + extract: any; + numTokens: number; + warning: string | undefined; + totalUsage: TokenUsage; + model: string; +}> { + let extract: any; + let warning: string | undefined; + + if (markdown === undefined) { + throw new Error("document.markdown is undefined -- this is unexpected"); + } + + const { maxInputTokens, maxOutputTokens } = getModelLimits_F0(model.modelId); + // Calculate 80% of max input tokens (for content) + const maxTokensSafe = Math.floor(maxInputTokens * 0.8); + + // Use the new trimming function + const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit_F0( + markdown, + maxTokensSafe, + model.modelId, + previousWarning + ); + + markdown = trimmedMarkdown; + warning = trimWarning; + + try { + const prompt = options.prompt !== undefined + ? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}` + : `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`; + + if (mode === "no-object") { + const result = await generateText({ + model: model, + prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""), + temperature: options.temperature ?? 0, + system: options.systemPrompt, + }); + + extract = result.text; + + return { + extract, + warning, + numTokens, + totalUsage: { + promptTokens: numTokens, + completionTokens: result.usage?.completionTokens ?? 0, + totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + }, + model: model.modelId, + }; + } + + let schema = options.schema; + // Normalize the bad json schema users write (mogery) + if (schema && !(schema instanceof z.ZodType)) { + // let schema = options.schema; + if (schema) { + schema = removeDefaultProperty_F0(schema); + } + + if (schema && schema.type === "array") { + schema = { + type: "object", + properties: { + items: options.schema, + }, + required: ["items"], + additionalProperties: false, + }; + } else if (schema && typeof schema === "object" && !schema.type) { + schema = { + type: "object", + properties: Object.fromEntries( + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty_F0(value)]; + }), + ), + required: Object.keys(schema), + additionalProperties: false, + }; + } + + schema = normalizeSchema(schema); + } + + const repairConfig = { + experimental_repairText: async ({ text, error }) => { + // AI may output a markdown JSON code block. Remove it - mogery + if (typeof text === "string" && text.trim().startsWith("```")) { + if (text.trim().startsWith("```json")) { + text = text.trim().slice("```json".length).trim(); + } else { + text = text.trim().slice("```".length).trim(); + } + + if (text.trim().endsWith("```")) { + text = text.trim().slice(0, -"```".length).trim(); + } + + // If this fixes the JSON, just return it. If not, continue - mogery + try { + JSON.parse(text); + return text; + } catch (_) {} + } + + const { text: fixedText } = await generateText({ + model: model, + prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`, + system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON." + }); + return fixedText; + } + }; + + const generateObjectConfig = { + model: model, + prompt: prompt, + temperature: options.temperature ?? 0, + system: options.systemPrompt, + ...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }), + ...(!schema && { output: 'no-schema' as const }), + ...repairConfig, + ...(!schema && { + onError: (error: Error) => { + console.error(error); + } + }) + } satisfies Parameters[0]; + + const result = await generateObject(generateObjectConfig); + extract = result.object; + + // If the users actually wants the items object, they can specify it as 'required' in the schema + // otherwise, we just return the items array + if ( + options.schema && + options.schema.type === "array" && + !schema?.required?.includes("items") + ) { + extract = extract?.items; + } + + // Since generateObject doesn't provide token usage, we'll estimate it + const promptTokens = numTokens; + const completionTokens = result?.usage?.completionTokens ?? 0; + + return { + extract, + warning, + numTokens, + totalUsage: { + promptTokens, + completionTokens, + totalTokens: promptTokens + completionTokens, + }, + model: model.modelId, + }; + } catch (error) { + if (error.message?.includes('refused')) { + throw new LLMRefusalError(error.message); + } + throw error; + } +} + +export async function performLLMExtract( + meta: Meta, + document: Document, +): Promise { + if (meta.options.formats.includes("extract")) { + meta.internalOptions.abort?.throwIfAborted(); + const { extract, warning } = await generateCompletions_F0({ + logger: meta.logger.child({ + method: "performLLMExtract/generateCompletions", + }), + options: meta.options.extract!, + markdown: document.markdown, + previousWarning: document.warning + }); + + if (meta.options.formats.includes("json")) { + document.json = extract; + } else { + document.extract = extract; + } + document.warning = warning; + } + + return document; +} + +export function removeDefaultProperty_F0(schema: any): any { + if (typeof schema !== "object" || schema === null) return schema; + + const rest = { ...schema }; + + // unsupported global keys + delete rest.default; + + // unsupported object keys + delete rest.patternProperties; + delete rest.unevaluatedProperties; + delete rest.propertyNames; + delete rest.minProperties; + delete rest.maxProperties; + + // unsupported string keys + delete rest.minLength; + delete rest.maxLength; + delete rest.pattern; + delete rest.format; + + // unsupported number keys + delete rest.minimum; + delete rest.maximum; + delete rest.multipleOf; + + // unsupported array keys + delete rest.unevaluatedItems; + delete rest.contains; + delete rest.minContains; + delete rest.maxContains; + delete rest.minItems; + delete rest.maxItems; + delete rest.uniqueItems; + + for (const key in rest) { + if (Array.isArray(rest[key])) { + rest[key] = rest[key].map((item: any) => removeDefaultProperty_F0(item)); + } else if (typeof rest[key] === "object" && rest[key] !== null) { + rest[key] = removeDefaultProperty_F0(rest[key]); + } + } + + return rest; +} + +export async function generateSchemaFromPrompt_F0(prompt: string): Promise { + const model = getModel("gpt-4o"); + const temperatures = [0, 0.1, 0.3]; // Different temperatures to try + let lastError: Error | null = null; + + for (const temp of temperatures) { + try { + const { extract } = await generateCompletions_F0({ + logger: logger.child({ + method: "generateSchemaFromPrompt/generateCompletions", + }), + model: model, + options: { + mode: "llm", + systemPrompt: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt. +Consider: +1. The type of data being requested +2. Required fields vs optional fields +3. Appropriate data types for each field +4. Nested objects and arrays where appropriate + +Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it. +Supported types +The following types are supported for Structured Outputs: + +String +Number +Boolean +Integer +Object +Array +Enum +anyOf + +Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions. +Optionals are not supported. +DO NOT USE FORMATS. +Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties. +Return a valid JSON schema object with properties that would capture the information requested in the prompt.`, + prompt: `Generate a JSON schema for extracting the following information: ${prompt}`, + temperature: temp + }, + markdown: prompt + }); + + return extract; + + } catch (error) { + lastError = error as Error; + logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`); + continue; + } + } + + // If we get here, all attempts failed + throw new Error( + `Failed to generate schema after all attempts. Last error: ${lastError?.message}`, + ); +} diff --git a/apps/api/src/lib/extract/fire-0/ranker-f0.ts b/apps/api/src/lib/extract/fire-0/ranker-f0.ts new file mode 100644 index 00000000..4ab45291 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/ranker-f0.ts @@ -0,0 +1,86 @@ +import { embed } from "ai"; +import { configDotenv } from "dotenv"; +import { getEmbeddingModel } from "../../../lib/generic-ai"; + +configDotenv(); + +async function getEmbedding(text: string) { + const { embedding } = await embed({ + model: getEmbeddingModel("text-embedding-3-small"), + value: text, + }); + + return embedding; +} + +const cosineSimilarity = (vec1: number[], vec2: number[]): number => { + const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0); + const magnitude1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0)); + const magnitude2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0)); + if (magnitude1 === 0 || magnitude2 === 0) return 0; + return dotProduct / (magnitude1 * magnitude2); +}; + +// Function to convert text to vector +const textToVector = (searchQuery: string, text: string): number[] => { + const words = searchQuery.toLowerCase().split(/\W+/); + return words.map((word) => { + const count = (text.toLowerCase().match(new RegExp(word, "g")) || []) + .length; + return count / text.length; + }); +}; + +async function performRanking_F0( + linksWithContext: string[], + links: string[], + searchQuery: string, +) { + try { + // Handle invalid inputs + if (!searchQuery || !linksWithContext.length || !links.length) { + return []; + } + + // Sanitize search query by removing null characters + const sanitizedQuery = searchQuery; + + // Generate embeddings for the search query + const queryEmbedding = await getEmbedding(sanitizedQuery); + + // Generate embeddings for each link and calculate similarity in parallel + const linksAndScores = await Promise.all( + linksWithContext.map((linkWithContext, index) => + getEmbedding(linkWithContext) + .then((linkEmbedding) => { + const score = cosineSimilarity(queryEmbedding, linkEmbedding); + return { + link: links[index], + linkWithContext, + score, + originalIndex: index, + }; + }) + .catch(() => ({ + link: links[index], + linkWithContext, + score: 0, + originalIndex: index, + })), + ), + ); + + // Sort links based on similarity scores while preserving original order for equal scores + linksAndScores.sort((a, b) => { + const scoreDiff = b.score - a.score; + return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff; + }); + + return linksAndScores; + } catch (error) { + console.error(`Error performing semantic search: ${error}`); + return []; + } +} + +export { performRanking_F0 }; diff --git a/apps/api/src/lib/extract/fire-0/reranker-f0.ts b/apps/api/src/lib/extract/fire-0/reranker-f0.ts new file mode 100644 index 00000000..87e673df --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/reranker-f0.ts @@ -0,0 +1,293 @@ +import { MapDocument, URLTrace } from "../../../controllers/v1/types"; +import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist"; +import { logger } from "../../logger"; +import { CohereClient } from "cohere-ai"; +import { extractConfig } from "../config"; +import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { performRanking_F0 } from "./ranker-f0"; +import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0"; + +const cohere = new CohereClient({ + token: process.env.COHERE_API_KEY, +}); + +interface RankingResult { + mappedLinks: MapDocument[]; + linksAndScores: { + link: string; + linkWithContext: string; + score: number; + originalIndex: number; + }[]; +} + +export async function rerankDocuments_FO( + documents: (string | Record)[], + query: string, + topN = 3, + model = "rerank-english-v3.0", +) { + const rerank = await cohere.v2.rerank({ + documents, + query, + topN, + model, + returnDocuments: true, + }); + + return rerank.results + .sort((a, b) => b.relevanceScore - a.relevanceScore) + .map((x) => ({ + document: x.document, + index: x.index, + relevanceScore: x.relevanceScore, + })); +} + +export async function rerankLinks_F0( + mappedLinks: MapDocument[], + searchQuery: string, + urlTraces: URLTrace[], +): Promise { + // console.log("Going to rerank links"); + const mappedLinksRerank = mappedLinks.map( + (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`, + ); + + const linksAndScores = await performRanking_F0( + mappedLinksRerank, + mappedLinks.map((l) => l.url), + searchQuery, + ); + + // First try with high threshold + let filteredLinks = filterAndProcessLinks_F0( + mappedLinks, + linksAndScores, + extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE, + ); + + // If we don't have enough high-quality links, try with lower threshold + if (filteredLinks.length < extractConfig.RERANKING.MIN_REQUIRED_LINKS) { + logger.info( + `Only found ${filteredLinks.length} links with score > ${extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE}. Trying lower threshold...`, + ); + filteredLinks = filterAndProcessLinks_F0( + mappedLinks, + linksAndScores, + extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE, + ); + + if (filteredLinks.length === 0) { + // If still no results, take top N results regardless of score + logger.warn( + `No links found with score > ${extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE}. Taking top ${extractConfig.RERANKING.MIN_REQUIRED_LINKS} results.`, + ); + filteredLinks = linksAndScores + .sort((a, b) => b.score - a.score) + .slice(0, extractConfig.RERANKING.MIN_REQUIRED_LINKS) + .map((x) => mappedLinks.find((link) => link.url === x.link)) + .filter( + (x): x is MapDocument => + x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), + ); + } + } + + // Update URL traces with relevance scores and mark filtered out URLs + linksAndScores.forEach((score) => { + const trace = urlTraces.find((t) => t.url === score.link); + if (trace) { + trace.relevanceScore = score.score; + // If URL didn't make it through filtering, mark it as filtered out + if (!filteredLinks.some((link) => link.url === score.link)) { + trace.warning = `Relevance score ${score.score} below threshold`; + trace.usedInCompletion = false; + } + } + }); + + const rankedLinks = filteredLinks.slice( + 0, + extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE, + ); + + // Mark URLs that will be used in completion + rankedLinks.forEach((link) => { + const trace = urlTraces.find((t) => t.url === link.url); + if (trace) { + trace.usedInCompletion = true; + } + }); + + // Mark URLs that were dropped due to ranking limit + filteredLinks + .slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE) + .forEach((link) => { + const trace = urlTraces.find((t) => t.url === link.url); + if (trace) { + trace.warning = "Excluded due to ranking limit"; + trace.usedInCompletion = false; + } + }); + + // console.log("Reranked links: ", rankedLinks.length); + + return rankedLinks; +} + +function filterAndProcessLinks_F0( + mappedLinks: MapDocument[], + linksAndScores: { + link: string; + linkWithContext: string; + score: number; + originalIndex: number; + }[], + threshold: number, +): MapDocument[] { + return linksAndScores + .filter((x) => x.score > threshold) + .map((x) => mappedLinks.find((link) => link.url === x.link)) + .filter( + (x): x is MapDocument => + x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), + ); +} + +export type RerankerResult = { + mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[]; + tokensUsed: number; +}; + +export type RerankerOptions = { + links: MapDocument[]; + searchQuery: string; + urlTraces: URLTrace[]; +}; + +export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise { + const { links, searchQuery, urlTraces } = options; + const chunkSize = 100; + const chunks: MapDocument[][] = []; + const TIMEOUT_MS = 20000; + const MAX_RETRIES = 2; + let totalTokensUsed = 0; + + // Split links into chunks of 200 + for (let i = 0; i < links.length; i += chunkSize) { + chunks.push(links.slice(i, i + chunkSize)); + } + + // console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`); + + const schema = { + type: "object", + properties: { + relevantLinks: { + type: "array", + items: { + type: "object", + properties: { + url: { type: "string" }, + relevanceScore: { type: "number" }, + reason: { type: "string", description: "The reason why you chose the score for this link given the intent." }, + }, + required: ["url", "relevanceScore", "reason"], + }, + }, + }, + required: ["relevantLinks"], + }; + + const results = await Promise.all( + chunks.map(async (chunk, chunkIndex) => { + // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`); + + const linksContent = chunk + .map( + (link) => + `URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`, + ) + .join("\n\n"); + + for (let retry = 0; retry <= MAX_RETRIES; retry++) { + try { + const timeoutPromise = new Promise((resolve) => { + setTimeout(() => resolve(null), TIMEOUT_MS); + }); + + // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent]) + const completionPromise = generateCompletions({ + logger: logger.child({ + method: "rerankLinksWithLLM", + chunk: chunkIndex + 1, + retry, + }), + options: { + mode: "llm", + systemPrompt: buildRerankerSystemPrompt_F0(), + prompt: buildRerankerUserPrompt_F0(searchQuery), + schema: schema, + }, + markdown: linksContent, + isExtractEndpoint: true + }); + + const completion = await Promise.race([ + completionPromise, + timeoutPromise, + ]); + + if (!completion) { + // console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`); + continue; + } + + if (!completion.extract?.relevantLinks) { + // console.warn(`Chunk ${chunkIndex + 1}: No relevant links found in completion response`); + return []; + } + + totalTokensUsed += completion.numTokens || 0; + // console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`); + return completion.extract.relevantLinks; + } catch (error) { + console.warn( + `Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, + error, + ); + if (retry === MAX_RETRIES) { + // console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`); + return []; + } + } + } + return []; + }), + ); + + // console.log(`Processed ${results.length} chunks`); + + // Flatten results and sort by relevance score + const flattenedResults = results + .flat() + .sort((a, b) => b.relevanceScore - a.relevanceScore); + // console.log(`Total relevant links found: ${flattenedResults.length}`); + + // Map back to MapDocument format, keeping only relevant links + const relevantLinks = flattenedResults + .map((result) => { + const link = links.find((link) => link.url === result.url); + if (link) { + return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason }; + } + return undefined; + }) + .filter((link): link is NonNullable => link !== undefined); + + return { + mapDocument: relevantLinks, + tokensUsed: totalTokensUsed, + }; +} diff --git a/apps/api/src/lib/extract/fire-0/url-processor-f0.ts b/apps/api/src/lib/extract/fire-0/url-processor-f0.ts new file mode 100644 index 00000000..62ac2325 --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/url-processor-f0.ts @@ -0,0 +1,250 @@ +import { MapDocument, URLTrace } from "../../../controllers/v1/types"; +import { getMapResults } from "../../../controllers/v1/map"; +import { removeDuplicateUrls } from "../../validateUrl"; +import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist"; +import { buildPreRerankPrompt, buildRefrasedPrompt } from "../build-prompts"; +import { rerankLinksWithLLM_F0 } from "./reranker-f0"; +import { extractConfig } from "../config"; +import type { Logger } from "winston"; +import { generateText } from "ai"; +import { getModel } from "../../generic-ai"; + +export async function generateBasicCompletion_FO(prompt: string) { + const { text } = await generateText({ + model: getModel("gpt-4o"), + prompt: prompt, + temperature: 0 + }); + return text; +} +interface ProcessUrlOptions { + url: string; + prompt?: string; + schema?: any; + teamId: string; + allowExternalLinks?: boolean; + origin?: string; + limit?: number; + includeSubdomains?: boolean; +} + +export async function processUrl_F0( + options: ProcessUrlOptions, + urlTraces: URLTrace[], + updateExtractCallback: (links: string[]) => void, + logger: Logger, +): Promise { + const trace: URLTrace = { + url: options.url, + status: "mapped", + timing: { + discoveredAt: new Date().toISOString(), + }, + }; + urlTraces.push(trace); + + if (!options.url.includes("/*") && !options.allowExternalLinks) { + if (!isUrlBlocked(options.url)) { + trace.usedInCompletion = true; + return [options.url]; + } + logger.warn("URL is blocked"); + trace.status = "error"; + trace.error = "URL is blocked"; + trace.usedInCompletion = false; + return []; + } + + const baseUrl = options.url.replace("/*", ""); + let urlWithoutWww = baseUrl.replace("www.", ""); + + let searchQuery = options.prompt; + if (options.prompt) { + searchQuery = + ( + await generateBasicCompletion_FO( + buildRefrasedPrompt(options.prompt, baseUrl), + ) + ) + ?.replace('"', "") + .replace("/", "") ?? options.prompt; + } + + try { + logger.debug("Running map...", { + search: searchQuery, + }); + const mapResults = await getMapResults({ + url: baseUrl, + search: searchQuery, + teamId: options.teamId, + allowExternalLinks: options.allowExternalLinks, + origin: options.origin, + limit: options.limit, + ignoreSitemap: false, + includeMetadata: true, + includeSubdomains: options.includeSubdomains, + }); + + let mappedLinks = mapResults.mapResults as MapDocument[]; + let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + let uniqueUrls = removeDuplicateUrls(allUrls); + logger.debug("Map finished.", { + linkCount: allUrls.length, + uniqueLinkCount: uniqueUrls.length, + }); + + // Track all discovered URLs + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: "mapped", + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + + // retry if only one url is returned + if (uniqueUrls.length <= 1) { + logger.debug("Running map... (pass 2)"); + const retryMapResults = await getMapResults({ + url: baseUrl, + teamId: options.teamId, + allowExternalLinks: options.allowExternalLinks, + origin: options.origin, + limit: options.limit, + ignoreSitemap: false, + includeMetadata: true, + includeSubdomains: options.includeSubdomains, + }); + + mappedLinks = retryMapResults.mapResults as MapDocument[]; + allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + uniqueUrls = removeDuplicateUrls(allUrls); + logger.debug("Map finished. (pass 2)", { + linkCount: allUrls.length, + uniqueLinkCount: uniqueUrls.length, + }); + + // Track all discovered URLs + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: "mapped", + warning: "Broader search. Not limiting map results to prompt.", + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + } + + // Track all discovered URLs + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: "mapped", + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + + const existingUrls = new Set(mappedLinks.map((m) => m.url)); + const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url)); + + mappedLinks = [ + ...mappedLinks, + ...newUrls.map((url) => ({ url, title: "", description: "" })), + ]; + + if (mappedLinks.length === 0) { + mappedLinks = [{ url: baseUrl, title: "", description: "" }]; + } + + // Limit initial set of links (1000) + mappedLinks = mappedLinks.slice( + 0, + extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT, + ); + + updateExtractCallback(mappedLinks.map((x) => x.url)); + + let rephrasedPrompt = options.prompt ?? searchQuery; + try { + rephrasedPrompt = + (await generateBasicCompletion_FO( + buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl), + )) ?? + "Extract the data according to the schema: " + + JSON.stringify(options.schema, null, 2); + } catch (error) { + console.error("Error generating search query from schema:", error); + rephrasedPrompt = + "Extract the data according to the schema: " + + JSON.stringify(options.schema, null, 2) + + " " + + options?.prompt; // Fallback to just the domain + } + + // "mapped-links.txt", + // mappedLinks, + // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` + // ); + + logger.info("Generated rephrased prompt.", { + rephrasedPrompt, + }); + + logger.info("Reranking pass 1 (threshold 0.8)..."); + const rerankerResult = await rerankLinksWithLLM_F0({ + links: mappedLinks, + searchQuery: rephrasedPrompt, + urlTraces, + }); + mappedLinks = rerankerResult.mapDocument; + let tokensUsed = rerankerResult.tokensUsed; + logger.info("Reranked! (pass 1)", { + linkCount: mappedLinks.length, + }); + + // 2nd Pass, useful for when the first pass returns too many links + if (mappedLinks.length > 100) { + logger.info("Reranking (pass 2)..."); + const rerankerResult = await rerankLinksWithLLM_F0({ + links: mappedLinks, + searchQuery: rephrasedPrompt, + urlTraces, + }); + mappedLinks = rerankerResult.mapDocument; + tokensUsed += rerankerResult.tokensUsed; + logger.info("Reranked! (pass 2)", { + linkCount: mappedLinks.length, + }); + } + + // dumpToFile( + // "llm-links.txt", + // mappedLinks, + // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` + // ); + // Remove title and description from mappedLinks + mappedLinks = mappedLinks.map((link) => ({ url: link.url })); + return mappedLinks.map((x) => x.url); + } catch (error) { + trace.status = "error"; + trace.error = error.message; + trace.usedInCompletion = false; + return []; + } +} diff --git a/apps/api/src/lib/extract/fire-0/usage/llm-cost-f0.ts b/apps/api/src/lib/extract/fire-0/usage/llm-cost-f0.ts new file mode 100644 index 00000000..ab6db63f --- /dev/null +++ b/apps/api/src/lib/extract/fire-0/usage/llm-cost-f0.ts @@ -0,0 +1,61 @@ +import { TokenUsage } from "../../../../controllers/v1/types"; +import { logger } from "../../../../lib/logger"; +import { modelPrices } from "../../usage/model-prices"; + +interface ModelPricing { + input_cost_per_token?: number; + output_cost_per_token?: number; + input_cost_per_request?: number; + mode: string; +} +const tokenPerCharacter = 4; +const baseTokenCost = 300; + +export function calculateFinalResultCost_F0(data: any): number { + return Math.floor( + JSON.stringify(data).length / tokenPerCharacter + baseTokenCost, + ); +} + +export function estimateTotalCost_F0(tokenUsage: TokenUsage[]): number { + return tokenUsage.reduce((total, usage) => { + return total + estimateCost_F0(usage); + }, 0); +} + +export function estimateCost_F0(tokenUsage: TokenUsage): number { + let totalCost = 0; + try { + let model = tokenUsage.model ?? (process.env.MODEL_NAME || "gpt-4o-mini"); + const pricing = modelPrices[model] as ModelPricing; + + if (!pricing) { + logger.error(`No pricing information found for model: ${model}`); + return 0; + } + + if (pricing.mode !== "chat") { + logger.error(`Model ${model} is not a chat model`); + return 0; + } + + // Add per-request cost if applicable (Only Perplexity supports this) + if (pricing.input_cost_per_request) { + totalCost += pricing.input_cost_per_request; + } + + // Add token-based costs + if (pricing.input_cost_per_token) { + totalCost += tokenUsage.promptTokens * pricing.input_cost_per_token; + } + + if (pricing.output_cost_per_token) { + totalCost += tokenUsage.completionTokens * pricing.output_cost_per_token; + } + + return Number(totalCost.toFixed(7)); + } catch (error) { + logger.error(`Error estimating cost: ${error}`); + return totalCost; + } +} diff --git a/apps/api/src/lib/extract/helpers/merge-null-val-objs.ts b/apps/api/src/lib/extract/helpers/merge-null-val-objs.ts index 4f67f989..24aa5761 100644 --- a/apps/api/src/lib/extract/helpers/merge-null-val-objs.ts +++ b/apps/api/src/lib/extract/helpers/merge-null-val-objs.ts @@ -145,6 +145,11 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { `Expected an array at objArray[${key}], but found:`, objArray[key], ); + + // create an array if it doesn't exist + if (objArray[key] === undefined) { + objArray[key] = []; + } return objArray; } } diff --git a/apps/api/src/lib/extract/helpers/transform-array-to-obj.ts b/apps/api/src/lib/extract/helpers/transform-array-to-obj.ts index c164951f..710ba131 100644 --- a/apps/api/src/lib/extract/helpers/transform-array-to-obj.ts +++ b/apps/api/src/lib/extract/helpers/transform-array-to-obj.ts @@ -91,12 +91,23 @@ export function transformArrayToObject( // Iterate over each item in the arrayData arrayData.forEach((item) => { let currentItem = item; + // Skip null items + if (currentItem === null) { + return; + } arrayKeyParts.forEach((part) => { - if (currentItem[part]) { + if (currentItem && currentItem[part]) { currentItem = currentItem[part]; + } else { + currentItem = null; } }); + // Skip if we couldn't find the nested path + if (currentItem === null) { + return; + } + // Copy non-array properties from the parent object for (const key in parentSchema.properties) { if ( @@ -108,8 +119,8 @@ export function transformArrayToObject( } } - // Ensure that the currentItem[arrayKey] is an array before mapping - if (Array.isArray(currentItem[arrayKey])) { + // Ensure that the currentItem[arrayKey] exists and is an array before mapping + if (currentItem && currentItem[arrayKey] && Array.isArray(currentItem[arrayKey])) { currentItem[arrayKey].forEach((subItem: any) => { if ( typeof subItem === "object" && @@ -138,14 +149,20 @@ export function transformArrayToObject( } else { console.warn( `Expected an array at ${arrayKey}, but found:`, - currentItem[arrayKey], + currentItem ? currentItem[arrayKey] : 'undefined' ); + + // create an array if it doesn't exist + if (currentLevel[arrayKey] === undefined) { + currentLevel[arrayKey] = []; + } } // Handle merging of array properties for (const key in parentSchema.properties) { if ( parentSchema.properties[key].type === "array" && + currentItem && Array.isArray(currentItem[key]) ) { if (!currentLevel[key]) { diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index f08b262b..a9ca8595 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -9,6 +9,11 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExt import { buildRerankerUserPrompt } from "./build-prompts"; import { buildRerankerSystemPrompt } from "./build-prompts"; import { dumpToFile } from "./helpers/dump-to-file"; +import { getModel } from "../generic-ai"; +import fs from "fs/promises"; + +const THRESHOLD_FOR_SINGLEPAGE = 0.6; +const THRESHOLD_FOR_MULTIENTITY = 0.45; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, @@ -161,22 +166,42 @@ function filterAndProcessLinks( export type RerankerResult = { mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[]; tokensUsed: number; + cost: number; }; export type RerankerOptions = { links: MapDocument[]; searchQuery: string; urlTraces: URLTrace[]; + isMultiEntity: boolean; + reasoning: string; + multiEntityKeys: string[]; + keyIndicators: string[]; }; -export async function rerankLinksWithLLM(options: RerankerOptions): Promise { - const { links, searchQuery, urlTraces } = options; - const chunkSize = 100; +export async function rerankLinksWithLLM( + options: RerankerOptions, +): Promise { + const { + links, + searchQuery, + urlTraces, + isMultiEntity, + reasoning, + multiEntityKeys, + keyIndicators, + } = options; + const chunkSize = 5000; const chunks: MapDocument[][] = []; - const TIMEOUT_MS = 20000; + const TIMEOUT_MS = 60000; const MAX_RETRIES = 2; let totalTokensUsed = 0; + // await fs.writeFile( + // `logs/links-${crypto.randomUUID()}.txt`, + // JSON.stringify(links, null, 2), + // ); + // Split links into chunks of 200 for (let i = 0; i < links.length; i += chunkSize) { chunks.push(links.slice(i, i + chunkSize)); @@ -194,7 +219,11 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise { // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`); @@ -214,33 +245,91 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise((resolve) => { setTimeout(() => resolve(null), TIMEOUT_MS); }); - // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent]) - const completionPromise = generateCompletions({ - logger: logger.child({ - method: "rerankLinksWithLLM", - chunk: chunkIndex + 1, - retry, - }), - options: { - mode: "llm", - systemPrompt: buildRerankerSystemPrompt(), - prompt: buildRerankerUserPrompt(searchQuery), - schema: schema, - }, - markdown: linksContent, - isExtractEndpoint: true - }); + const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? "collecting multiple items" : "specific information"}. + The user's query is: ${searchQuery} + ${ + isMultiEntity + ? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(", ")}. + Score URLs higher if they contain ANY instance of the target entities. + Key indicators to look for: ${keyIndicators.join(", ")}` + : `IMPORTANT: This is a specific information task. + Score URLs based on precision and relevance to answering the query.` + } + + Scoring guidelines: + ${ + isMultiEntity + ? ` + - 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0 + - 0.8: Contains entity but may be incomplete information + - 0.6: Mentions entity type but no clear instance + - 0.4: Only tangentially related to entity type + - Below 0.4: No mention of relevant entities, or duplicates + + Reason: ${reasoning} + ` + : ` + - 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0 + - 0.8: Contains information that directly helps answer the query + - 0.6: Contains related information that partially answers query + - Below 0.6: Information too general or not focused on query + ` + }`; - const completion = await Promise.race([ - completionPromise, - timeoutPromise, - ]); + // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent]) + // const gemini = getGemini(); + // const model = getGemini() + let completion: any; + try { + const completionPromise = generateCompletions({ + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + logger: logger.child({ + method: "rerankLinksWithLLM", + chunk: chunkIndex + 1, + retry, + }), + options: { + mode: "llm", + systemPrompt: systemPrompt, + prompt: buildRerankerUserPrompt(searchQuery), + schema: schema, + // temperature: isMultiEntity ? 0.5 : 0.3, + }, + // providerOptions: { + // anthropic: { + // thinking: { type: 'enabled', budgetTokens: 12000 }, + // tool_choice: "auto", + // }, + // }, + markdown: linksContent, + isExtractEndpoint: true, + }); + + completion = await completionPromise; + totalCost += completion.cost; + } catch (error) { + console.warn( + `Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, + error, + ); + } + + // await fs.writeFile( + // `logs/reranker-${crypto.randomUUID()}.json`, + // JSON.stringify(completion, null, 2), + // ); if (!completion) { // console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`); @@ -278,19 +367,48 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise b.relevanceScore - a.relevanceScore); // console.log(`Total relevant links found: ${flattenedResults.length}`); - // Map back to MapDocument format, keeping only relevant links + // Map back to MapDocument format, keeping ALL links for testing const relevantLinks = flattenedResults .map((result) => { - const link = links.find((link) => link.url === result.url); - if (link) { - return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason }; + if ( + result.relevanceScore > + (isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE) + ) { + const link = links.find((link) => link.url === result.url); + if (link) { + return { + ...link, + relevanceScore: result.relevanceScore + ? parseFloat(result.relevanceScore) + : 0, + reason: result.reason, + }; + } } return undefined; }) .filter((link): link is NonNullable => link !== undefined); + // Add debug logging for testing + // fs.writeFile( + // `logs/reranker-aaa-${crypto.randomUUID()}.json`, + // JSON.stringify( + // { + // totalResults: relevantLinks.length, + // scores: relevantLinks.map((l) => ({ + // url: l.url, + // score: l.relevanceScore, + // reason: l.reason, + // })), + // }, + // null, + // 2, + // ), + // ); + return { mapDocument: relevantLinks, tokensUsed: totalTokensUsed, + cost: totalCost, }; } diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 9cc5607e..cd683bd8 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -8,14 +8,42 @@ import { extractConfig } from "./config"; import type { Logger } from "winston"; import { generateText } from "ai"; import { getModel } from "../generic-ai"; +import { calculateCost } from "../../scraper/scrapeURL/transformers/llmExtract"; +import type { CostTracking } from "./extraction-service"; -export async function generateBasicCompletion(prompt: string) { - const { text } = await generateText({ - model: getModel("gpt-4o"), - prompt: prompt, - temperature: 0 - }); - return text; +export async function generateBasicCompletion(prompt: string): Promise<{ text: string, cost: number } | null> { + try { + const result = await generateText({ + model: getModel("gpt-4o", "openai"), + prompt: prompt, + providerOptions: { + anthropic: { + thinking: { type: "enabled", budgetTokens: 12000 }, + }, + } + }); + return { text: result.text, cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) }; + } catch (error) { + console.error("Error generating basic completion:", error); + if (error?.type == "rate_limit_error") { + try { + const result = await generateText({ + model: getModel("gpt-4o-mini", "openai"), + prompt: prompt, + providerOptions: { + anthropic: { + thinking: { type: "enabled", budgetTokens: 12000 }, + }, + } + }); + return { text: result.text, cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) }; + } catch (fallbackError) { + console.error("Error generating basic completion with fallback model:", fallbackError); + return null; + } + } + return null; + } } interface ProcessUrlOptions { url: string; @@ -26,6 +54,11 @@ interface ProcessUrlOptions { origin?: string; limit?: number; includeSubdomains?: boolean; + log?: any; + isMultiEntity: boolean; + reasoning: string; + multiEntityKeys: string[]; + keyIndicators: string[]; } export async function processUrl( @@ -33,6 +66,7 @@ export async function processUrl( urlTraces: URLTrace[], updateExtractCallback: (links: string[]) => void, logger: Logger, + costTracking: CostTracking, ): Promise { const trace: URLTrace = { url: options.url, @@ -60,14 +94,16 @@ export async function processUrl( let searchQuery = options.prompt; if (options.prompt) { - searchQuery = - ( - await generateBasicCompletion( - buildRefrasedPrompt(options.prompt, baseUrl), - ) - ) - ?.replace('"', "") - .replace("/", "") ?? options.prompt; + const res = await generateBasicCompletion( + buildRefrasedPrompt(options.prompt, baseUrl), + ); + + if (res) { + searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt; + costTracking.otherCallCount++; + costTracking.otherCost += res.cost; + costTracking.totalCost += res.cost; + } } try { @@ -93,6 +129,7 @@ export async function processUrl( linkCount: allUrls.length, uniqueLinkCount: uniqueUrls.length, }); + options.log["uniqueUrlsLength-1"] = uniqueUrls.length; // Track all discovered URLs uniqueUrls.forEach((discoveredUrl) => { @@ -146,6 +183,8 @@ export async function processUrl( }); } + options.log["uniqueUrlsLength-2"] = uniqueUrls.length; + // Track all discovered URLs uniqueUrls.forEach((discoveredUrl) => { if (!urlTraces.some((t) => t.url === discoveredUrl)) { @@ -182,12 +221,20 @@ export async function processUrl( let rephrasedPrompt = options.prompt ?? searchQuery; try { - rephrasedPrompt = - (await generateBasicCompletion( - buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl), - )) ?? - "Extract the data according to the schema: " + + const res = await generateBasicCompletion( + buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl), + ); + + if (res) { + rephrasedPrompt = res.text; + costTracking.otherCallCount++; + costTracking.otherCost += res.cost; + costTracking.totalCost += res.cost; + } else { + rephrasedPrompt = + "Extract the data according to the schema: " + JSON.stringify(options.schema, null, 2); + } } catch (error) { console.error("Error generating search query from schema:", error); rephrasedPrompt = @@ -211,13 +258,20 @@ export async function processUrl( links: mappedLinks, searchQuery: rephrasedPrompt, urlTraces, + isMultiEntity: options.isMultiEntity, + reasoning: options.reasoning, + multiEntityKeys: options.multiEntityKeys, + keyIndicators: options.keyIndicators, }); + costTracking.otherCallCount++; + costTracking.otherCost += rerankerResult.cost; + costTracking.totalCost += rerankerResult.cost; mappedLinks = rerankerResult.mapDocument; let tokensUsed = rerankerResult.tokensUsed; logger.info("Reranked! (pass 1)", { linkCount: mappedLinks.length, }); - + options.log["rerankerResult-1"] = mappedLinks.length; // 2nd Pass, useful for when the first pass returns too many links if (mappedLinks.length > 100) { logger.info("Reranking (pass 2)..."); @@ -225,13 +279,21 @@ export async function processUrl( links: mappedLinks, searchQuery: rephrasedPrompt, urlTraces, + isMultiEntity: options.isMultiEntity, + reasoning: options.reasoning, + multiEntityKeys: options.multiEntityKeys, + keyIndicators: options.keyIndicators, }); + costTracking.otherCallCount++; + costTracking.otherCost += rerankerResult.cost; + costTracking.totalCost += rerankerResult.cost; mappedLinks = rerankerResult.mapDocument; tokensUsed += rerankerResult.tokensUsed; logger.info("Reranked! (pass 2)", { linkCount: mappedLinks.length, }); } + options.log["rerankerResult-2"] = mappedLinks.length; // dumpToFile( // "llm-links.txt", diff --git a/apps/api/src/lib/generic-ai.ts b/apps/api/src/lib/generic-ai.ts index 4718e1e2..1274ed32 100644 --- a/apps/api/src/lib/generic-ai.ts +++ b/apps/api/src/lib/generic-ai.ts @@ -1,17 +1,62 @@ -import { createOpenAI } from '@ai-sdk/openai'; +import { openai } from "@ai-sdk/openai"; import { createOllama } from "ollama-ai-provider"; +import { anthropic } from "@ai-sdk/anthropic"; +import { groq } from "@ai-sdk/groq"; +import { google } from "@ai-sdk/google"; +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; +import { fireworks } from "@ai-sdk/fireworks"; +import { deepinfra } from "@ai-sdk/deepinfra"; +import { createVertex } from "@ai-sdk/google-vertex"; -const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({ +type Provider = + | "openai" + | "ollama" + | "anthropic" + | "groq" + | "google" + | "openrouter" + | "fireworks" + | "deepinfra" + | "vertex"; +const defaultProvider: Provider = process.env.OLLAMA_BASE_URL + ? "ollama" + : "openai"; + +const providerList: Record = { + openai, //OPENAI_API_KEY + ollama: createOllama({ baseURL: process.env.OLLAMA_BASE_URL, -}) : createOpenAI({ - apiKey: process.env.OPENAI_API_KEY, - baseURL: process.env.OPENAI_BASE_URL, -}); + }), + anthropic, //ANTHROPIC_API_KEY + groq, //GROQ_API_KEY + google, //GOOGLE_GENERATIVE_AI_API_KEY + openrouter: createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY, + }), + fireworks, //FIREWORKS_API_KEY + deepinfra, //DEEPINFRA_API_KEY + vertex: createVertex({ + project: "firecrawl", + location: "us-central1", + googleAuthOptions: process.env.VERTEX_CREDENTIALS ? { + credentials: JSON.parse(atob(process.env.VERTEX_CREDENTIALS)), + } : { + keyFile: "./gke-key.json", + }, + }), +}; -export function getModel(name: string) { - return process.env.MODEL_NAME ? modelAdapter(process.env.MODEL_NAME) : modelAdapter(name); +export function getModel(name: string, provider: Provider = defaultProvider) { + return process.env.MODEL_NAME + ? providerList[provider](process.env.MODEL_NAME) + : providerList[provider](name); } -export function getEmbeddingModel(name: string) { - return process.env.MODEL_EMBEDDING_NAME ? modelAdapter.embedding(process.env.MODEL_EMBEDDING_NAME) : modelAdapter.embedding(name); +export function getEmbeddingModel( + name: string, + provider: Provider = defaultProvider, +) { + return process.env.MODEL_EMBEDDING_NAME + ? providerList[provider].embedding(process.env.MODEL_EMBEDDING_NAME) + : providerList[provider].embedding(name); } diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index b962ff16..c53f55a7 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -6,6 +6,7 @@ import { crawlStatusController } from "../controllers/v1/crawl-status"; import { mapController } from "../controllers/v1/map"; import { ErrorResponse, + isAgentExtractModelValid, RequestWithACUC, RequestWithAuth, RequestWithMaybeAuth, @@ -93,6 +94,14 @@ export function authMiddleware( ): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { return (req, res, next) => { (async () => { + if (rateLimiterMode === RateLimiterMode.Extract && isAgentExtractModelValid((req.body as any)?.agent?.model)) { + rateLimiterMode = RateLimiterMode.ExtractAgentPreview; + } + + if (rateLimiterMode === RateLimiterMode.Scrape && isAgentExtractModelValid((req.body as any)?.agent?.model)) { + rateLimiterMode = RateLimiterMode.ScrapeAgentPreview; + } + const auth = await authenticateUser(req, res, rateLimiterMode); if (!auth.success) { diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts new file mode 100644 index 00000000..840c56fc --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -0,0 +1,347 @@ +import { Logger } from "winston"; +import { z } from "zod"; +import { + generateCompletions, + GenerateCompletionsOptions, + generateSchemaFromPrompt, +} from "../transformers/llmExtract"; +import { smartScrape } from "./smartScrape"; +import { parseMarkdown } from "../../../lib/html-to-markdown"; +import { getModel } from "../../../lib/generic-ai"; +import { TokenUsage } from "../../../controllers/v1/types"; +import type { SmartScrapeResult } from "./smartScrape"; + +const commonSmartScrapeProperties = { + shouldUseSmartscrape: { + type: "boolean", + description: + "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, login, inputs etc.). SmartScrape can perform these actions to access the data.", + }, + // Note: extractedData is added dynamically in prepareSmartScrapeSchema +}; + +// Define common properties for reasoning and prompt +const commonReasoningPromptProperties = { + smartscrape_reasoning: { + type: ["string", "null"], + // Using the more detailed multi-step description as the common one + description: + "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.", + }, + smartscrape_prompt: { + type: ["string", "null"], + description: `A clear, outcome-focused prompt describing what information to find on the page. + Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs". + Used by the smart scraping agent to determine what actions to take. + Dont mention anything about extraction, smartscrape just returns page content.`, + }, +}; + +// Schema for single-step SmartScrape interaction +const smartScrapeWrapperSchemaDefinition = { + type: "object", + properties: { + ...commonSmartScrapeProperties, // Include shared base properties + ...commonReasoningPromptProperties, // Include shared reasoning/prompt properties + // extractedData will be added dynamically + }, + additionalProperties: false, + required: ["extractedData", "shouldUseSmartscrape"], +}; + +// Schema for multi-step SmartScrape interaction +const multiSmartScrapeWrapperSchemaDefinition = { + type: "object", + properties: { + ...commonSmartScrapeProperties, // Include shared base properties + smartScrapePages: { + type: "array", + description: + "Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.", + items: { + type: "object", + properties: { + page_index: { + // Specific to items within the array + type: "number", + description: "The index of the page in the SmartScrape process.", + }, + ...commonReasoningPromptProperties, // Include shared reasoning/prompt properties here too + }, + // required: ["page_index", "smartscrape_reasoning", "smartscrape_prompt"], // If needed per step + // additionalProperties: false, + }, + }, + // extractedData will be added dynamically + }, + additionalProperties: false, + required: ["extractedData", "shouldUseSmartscrape"], +}; + +//TODO: go over and check +// should add null to all types +// type:string should be type:["string","null"] +export function makeSchemaNullable(schema: any): any { + if (typeof schema !== "object" || schema === null) { + return schema; // Base case: not an object/array or is null + } + + if (Array.isArray(schema)) { + return schema.map(makeSchemaNullable); // Recurse for array items + } + + // Process object properties + const newSchema: { [key: string]: any } = {}; + let isObject = false; // Flag to track if this level is an object type + + for (const key in schema) { + if (key === "additionalProperties") { + continue; // Skip existing additionalProperties, we'll set it later if needed + } + + if (key === "type") { + const currentType = schema[key]; + let finalType: string | string[]; + + if (typeof currentType === "string") { + if (currentType === "object") isObject = true; + finalType = + currentType === "null" ? currentType : [currentType, "null"]; + } else if (Array.isArray(currentType)) { + if (currentType.includes("object")) isObject = true; + finalType = currentType.includes("null") + ? currentType + : [...currentType, "null"]; + } else { + finalType = currentType; // Handle unexpected types? + } + newSchema[key] = finalType; + } else if (typeof schema[key] === "object" && schema[key] !== null) { + // Recurse for nested objects (properties, items, definitions, etc.) + newSchema[key] = makeSchemaNullable(schema[key]); + if (key === "properties") { + // Having a 'properties' key strongly implies an object type + isObject = true; + } + } else { + // Copy other properties directly (like required, description, etc.) + newSchema[key] = schema[key]; + } + } + + // **Crucial Fix:** If this schema represents an object type, add additionalProperties: false + if (isObject) { + // Ensure 'properties' exists if 'type' was 'object' but 'properties' wasn't defined + if (!newSchema.properties) { + newSchema.properties = {}; + } + newSchema.additionalProperties = false; + } + + return newSchema; +} + +/** + * Wraps the original schema with SmartScrape fields if an original schema exists. + * + * @param originalSchema The user-provided schema (JSON Schema object or Zod schema). + * @param logger Winston logger instance. + * @returns An object containing the schema to use for the LLM call and whether wrapping occurred. + */ +export function prepareSmartScrapeSchema( + originalSchema: any | z.ZodTypeAny | undefined, + logger: Logger, + isSingleUrl: boolean, +) { + // Make the user's schema nullable *and* ensure nested objects have additionalProperties:false + const nullableAndStrictSchema = originalSchema; + + let smartScrapeWrapScehma; + if (isSingleUrl) { + smartScrapeWrapScehma = smartScrapeWrapperSchemaDefinition; + } else { + smartScrapeWrapScehma = multiSmartScrapeWrapperSchemaDefinition; + } + + const wrappedSchema = { + ...smartScrapeWrapScehma, // Uses the wrapper defined above + properties: { + extractedData: nullableAndStrictSchema, // Nest the modified original schema + ...smartScrapeWrapScehma.properties, // Add smartscrape fields + }, + // required is inherited from smartScrapeWrapperSchemaDefinition + // additionalProperties:false is inherited from smartScrapeWrapperSchemaDefinition for the top level + }; + + logger.info("Wrapping original schema with SmartScrape fields.", { + // Limit logging potentially large schemas + wrappedSchemaKeys: Object.keys(wrappedSchema.properties), + }); + return { schemaToUse: wrappedSchema }; +} + +export async function extractData({ + extractOptions, + urls, + useAgent, +}: { + extractOptions: GenerateCompletionsOptions; + urls: string[]; + useAgent: boolean; +}): Promise<{ + extractedDataArray: any[]; + warning: any; + smartScrapeCallCount: number; + otherCallCount: number; + smartScrapeCost: number; + otherCost: number; + costLimitExceededTokenUsage: number | null; +}> { + let schema = extractOptions.options.schema; + const logger = extractOptions.logger; + const isSingleUrl = urls.length === 1; + let smartScrapeCost = 0; + let otherCost = 0; + let smartScrapeCallCount = 0; + let otherCallCount = 0; + let costLimitExceededTokenUsage: number | null = null; + // TODO: remove the "required" fields here!! it breaks o3-mini + + if (!schema && extractOptions.options.prompt) { + logger.info("Generating schema from prompt"); + const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt); + otherCallCount++; + otherCost += genRes.cost; + schema = genRes.extract; + } + + const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl); + const extractOptionsNewSchema = { + ...extractOptions, + options: { ...extractOptions.options, schema: schemaToUse }, + }; + // console.log("schema", schema); + // console.log("schemaToUse", schemaToUse); + + let extract: any, + warning: string | undefined, + totalUsage: TokenUsage | undefined; + + // checks if using smartScrape is needed for this case + try { + const { + extract: e, + warning: w, + totalUsage: t, + cost: c, + } = await generateCompletions({ + ...extractOptionsNewSchema, + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + }); + extract = e; + warning = w; + totalUsage = t; + otherCost += c; + otherCallCount++; + } catch (error) { + logger.error( + "failed during extractSmartScrape.ts:generateCompletions", + error, + ); + // console.log("failed during extractSmartScrape.ts:generateCompletions", error); + } + + let extractedData = extract?.extractedData; + + // console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape); + // console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); + // console.log("smartscrape_prompt", extract?.smartscrape_prompt); + try { + console.log("========================================="); + console.log( + "useAgent:", + useAgent, + "shouldUseSmartscrape:", + extract?.shouldUseSmartscrape, + ); + console.log("url:", urls); + console.log("prompt:", extract?.smartscrape_prompt); + console.log("========================================="); + + if (useAgent && extract?.shouldUseSmartscrape) { + let smartscrapeResults: SmartScrapeResult[]; + if (isSingleUrl) { + smartscrapeResults = [ + await smartScrape(urls[0], extract?.smartscrape_prompt), + ]; + smartScrapeCost += smartscrapeResults[0].tokenUsage; + smartScrapeCallCount++; + } else { + const pages = extract?.smartscrapePages; + //do it async promiseall instead + smartscrapeResults = await Promise.all( + pages.map(async (page) => { + return await smartScrape( + urls[page.page_index], + page.smartscrape_prompt, + ); + }), + ); + smartScrapeCost += smartscrapeResults.reduce( + (acc, result) => acc + result.tokenUsage, + 0, + ); + smartScrapeCallCount += pages.length; + } + // console.log("smartscrapeResults", smartscrapeResults); + + const scrapedPages = smartscrapeResults.map( + (result) => result.scrapedPages, + ); + // console.log("scrapedPages", scrapedPages); + const htmls = scrapedPages.flat().map((page) => page.html); + // console.log("htmls", htmls); + const markdowns = await Promise.all( + htmls.map(async (html) => await parseMarkdown(html)), + ); + // console.log("markdowns", markdowns); + extractedData = await Promise.all( + markdowns.map(async (markdown) => { + const newExtractOptions = { + ...extractOptions, + markdown: markdown, + }; + const { extract, warning, totalUsage, model, cost } = + await generateCompletions(newExtractOptions); + otherCost += cost; + otherCallCount++; + return extract; + }), + ); + + // console.log("markdowns", markdowns); + // extractedData = smartscrapeResult; + } else { + extractedData = [extractedData]; + } + } catch (error) { + console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error); + if (error instanceof Error && error.message === "Cost limit exceeded") { + costLimitExceededTokenUsage = (error as any).cause.tokenUsage; + warning = "Smart scrape cost limit exceeded." + (warning ? " " + warning : "") + } else { + throw error; + } + } + + return { + extractedDataArray: extractedData, + warning: warning, + smartScrapeCallCount: smartScrapeCallCount, + otherCallCount: otherCallCount, + smartScrapeCost: smartScrapeCost, + otherCost: otherCost, + costLimitExceededTokenUsage: costLimitExceededTokenUsage, + }; +} diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts new file mode 100644 index 00000000..1afe663a --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -0,0 +1,164 @@ +import { z } from "zod"; +import { logger } from "../../../lib/logger"; +import { robustFetch } from "./fetch"; +import fs from "fs/promises"; +import { configDotenv } from "dotenv"; + +configDotenv(); + +// Define schemas outside the function scope +const tokenUsageDetailSchema = z.object({ + input_tokens: z.number().int(), + output_tokens: z.number().int(), + total_cost: z.number().nullable(), // Allows number or null +}); + +// Schema for an individual scraped page object +const scrapedPageSchema = z.object({ + html: z.string(), + reason: z.string(), + page: z.union([z.string(), z.number()]), +}); + +// Main schema for the structure returned by the smart-scrape endpoint +const smartScrapeResultSchema = z.object({ + sessionId: z.string(), + success: z.boolean(), + scrapedPages: z.array(scrapedPageSchema), + tokenUsage: z.number(), + + // z.record( + // z.string(), // Key is the model name (string) + // tokenUsageDetailSchema, // Value matches the detail schema + // ), +}); + +// Infer the TypeScript type from the Zod schema +export type SmartScrapeResult = z.infer; + +/** + * Sends a POST request to the internal /smart-scrape endpoint to extract + * structured data from a URL based on a prompt. + * + * @param url The URL of the page to scrape. + * @param prompt The prompt guiding the data extraction. + * @returns A promise that resolves to an object matching the SmartScrapeResult type. + * @throws Throws an error if the request fails or the response is invalid. + */ +export async function smartScrape( + url: string, + prompt: string, + sessionId?: string, +): Promise { + try { + logger.info("Initiating smart scrape request", { url, prompt }); + + // Pass schema type as generic parameter to robustFeth + const response = await robustFetch({ + url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`, + method: "POST", + body: { + url, + prompt, + userProvidedId: sessionId ?? undefined, + models: { + thinkingModel: { + model: "gemini-2.5-pro-preview-03-25", + provider: "vertex", + supportTools: true, + toolChoice: "required", + cost: { + input: 1.3, + output: 5, + }, + }, + toolModel: { + model: "gemini-2.0-flash", + provider: "google", + }, + }, + }, + schema: smartScrapeResultSchema, // Pass the schema instance for validation + logger, + mock: null, // Keep mock null if not mocking + }); + + // Check if the response indicates a 500 error + // Use type assertion to handle the error response structure + const errorResponse = response as unknown as { + success: boolean; + error?: string; + details?: string; + }; + + if ( + errorResponse && + errorResponse.success === false && + errorResponse.error + ) { + if (errorResponse.error === "Cost limit exceeded") { + throw new Error("Cost limit exceeded", { + cause: { tokenUsage: (errorResponse as any).tokenUsage }, + }); + } + + logger.error("Smart scrape returned error response", { + url, + prompt, + error: errorResponse.error, + details: errorResponse.details || "No details provided", + }); + throw new Error( + `Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ""}`, + ); + } + + logger.info("Smart scrape successful", { + url, + prompt, + sessionId: response.sessionId, + }); + + logger.info("Smart scrape cost $" + response.tokenUsage); + + return response; // The response type now matches SmartScrapeResult + } catch (error) { + // Safely extract error information without circular references + const errorInfo = { + message: error instanceof Error ? error.message : String(error), + name: error instanceof Error ? error.name : "Unknown", + stack: error instanceof Error ? error.stack : undefined, + // Extract cause safely if it exists + cause: + error instanceof Error && error.cause + ? error.cause instanceof Error + ? { + message: error.cause.message, + name: error.cause.name, + stack: error.cause.stack, + } + : typeof error.cause === "object" + ? { + ...Object.fromEntries( + Object.entries(error.cause).filter( + ([_, v]) => v !== null && typeof v !== "object", + ), + ), + error: + (error.cause as any)?.error?.message || + (error.cause as any)?.error, + } + : String(error.cause) + : undefined, + }; + + logger.error("Smart scrape request failed", { + url, + prompt, + error: JSON.stringify(errorInfo), + }); + + // Rethrowing the error to be handled by the caller + throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error }); + } +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts new file mode 100644 index 00000000..6ab32862 --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -0,0 +1,65 @@ +import { + Document, +} from "../../../controllers/v1/types"; +import { Meta } from ".."; +import { logger } from "../../../lib/logger"; +import { parseMarkdown } from "../../../lib/html-to-markdown"; +import { smartScrape, SmartScrapeResult } from "../lib/smartScrape"; + + +export async function performAgent( + meta: Meta, + document: Document, +): Promise { + if (meta.options.agent?.prompt) { + const url: string | undefined = document.url || document.metadata.sourceURL + + if (!url) { + logger.error("document.url or document.metadata.sourceURL is undefined -- this is unexpected"); + // throw new Error("document.url or document.metadata.sourceURL is undefined -- this is unexpected"); + return document; + } + + const prompt = meta.options.agent?.prompt ?? undefined + const sessionId = meta.options.agent?.sessionId ?? undefined + + let smartscrapeResults: SmartScrapeResult; + try { + smartscrapeResults = await smartScrape(url, prompt, sessionId) + } catch (error) { + if (error instanceof Error && error.message === "Cost limit exceeded") { + logger.error("Cost limit exceeded", { error }) + document.warning = "Smart scrape cost limit exceeded." + (document.warning ? " " + document.warning : "") + return document; + } else { + throw error; + } + } + + const html = smartscrapeResults.scrapedPages[smartscrapeResults.scrapedPages.length - 1].html + + if (meta.options.formats.includes("markdown")) { + const markdown = await parseMarkdown(html) + document.markdown = markdown + } + if (meta.options.formats.includes("html")) { + document.html = html + } + + if (document.metadata.costTracking) { + document.metadata.costTracking.smartScrapeCallCount++; + document.metadata.costTracking.smartScrapeCost = document.metadata.costTracking.smartScrapeCost + smartscrapeResults.tokenUsage; + document.metadata.costTracking.totalCost = document.metadata.costTracking.totalCost + smartscrapeResults.tokenUsage; + } else { + document.metadata.costTracking = { + smartScrapeCallCount: 1, + smartScrapeCost: smartscrapeResults.tokenUsage, + otherCallCount: 0, + otherCost: 0, + totalCost: smartscrapeResults.tokenUsage, + } + } + } + + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index 196027a5..8cdea891 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -6,9 +6,9 @@ import gitDiff from 'git-diff'; import parseDiff from 'parse-diff'; import { generateCompletions } from "./llmExtract"; -async function extractDataWithSchema(content: string, meta: Meta): Promise { +async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> { try { - const { extract } = await generateCompletions({ + const { extract, cost } = await generateCompletions({ logger: meta.logger.child({ method: "extractDataWithSchema/generateCompletions", }), @@ -20,7 +20,7 @@ async function extractDataWithSchema(content: string, meta: Meta): Promise }, markdown: content }); - return extract; + return { extract, cost }; } catch (error) { meta.logger.error("Error extracting data with schema", { error }); return null; @@ -144,7 +144,20 @@ export async function deriveDiff(meta: Meta, document: Document): Promise { @@ -74,7 +94,10 @@ function normalizeSchema(x: any): any { return { ...x, properties: Object.fromEntries( - Object.entries(x.properties || {}).map(([k, v]) => [k, normalizeSchema(v)]), + Object.entries(x.properties || {}).map(([k, v]) => [ + k, + normalizeSchema(v), + ]), ), required: Object.keys(x.properties || {}), additionalProperties: false, @@ -89,21 +112,24 @@ function normalizeSchema(x: any): any { } } - - interface TrimResult { text: string; numTokens: number; warning?: string; } -export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult { +export function trimToTokenLimit( + text: string, + maxTokens: number, + modelId: string = "gpt-4o", + previousWarning?: string, +): TrimResult { try { const encoder = encoding_for_model(modelId as TiktokenModel); try { const tokens = encoder.encode(text); const numTokens = tokens.length; - + if (numTokens <= maxTokens) { return { text, numTokens }; } @@ -111,7 +137,7 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin const modifier = 3; // Start with 3 chars per token estimation let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1); - + // Keep trimming until we're under the token limit while (true) { const currentTokens = encoder.encode(currentText); @@ -120,14 +146,18 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin return { text: currentText, numTokens: currentTokens.length, - warning: previousWarning ? `${warning} ${previousWarning}` : warning + warning: previousWarning + ? `${warning} ${previousWarning}` + : warning, }; } const overflow = currentTokens.length * modifier - maxTokens - 1; // If still over limit, remove another chunk - currentText = currentText.slice(0, Math.floor(currentText.length - overflow)); + currentText = currentText.slice( + 0, + Math.floor(currentText.length - overflow), + ); } - } catch (e) { throw e; } finally { @@ -138,88 +168,203 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin const estimatedCharsPerToken = 2.8; const safeLength = maxTokens * estimatedCharsPerToken; const trimmedText = text.slice(0, Math.floor(safeLength)); - + const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`; - + return { text: trimmedText, numTokens: maxTokens, // We assume we hit the max in this fallback case - warning: previousWarning ? `${warning} ${previousWarning}` : warning + warning: previousWarning ? `${warning} ${previousWarning}` : warning, }; } } -export async function generateCompletions({ - logger, - options, - markdown, - previousWarning, - isExtractEndpoint, - model = getModel("gpt-4o-mini"), - mode = "object", -}: { - model?: LanguageModel; +export function calculateCost( + model: string, + inputTokens: number, + outputTokens: number, +) { + const modelCosts = { + "openai/o3-mini": { input_cost: 1.1, output_cost: 4.4 }, + "google/gemini-2.0-flash-001": { input_cost: 0.15, output_cost: 0.6 }, + "deepseek/deepseek-r1": { input_cost: 0.55, output_cost: 2.19 }, + "google/gemini-2.0-flash-thinking-exp:free": { + input_cost: 0.55, + output_cost: 2.19, + }, + }; + let modelCost = modelCosts[model] || { input_cost: 0, output_cost: 0 }; + //gemini-2.5-pro-exp-03-25 pricing + if ( + model === "gemini-2.5-pro-exp-03-25" || + model === "gemini-2.5-pro-preview-03-25" + ) { + let inputCost = 0; + let outputCost = 0; + if (inputTokens <= 200000) { + inputCost = 1.25; + } else { + inputCost = 2.5; + } + if (outputTokens <= 200000) { + outputCost = 10.0; + } else { + outputCost = 15.0; + } + modelCost = { input_cost: inputCost, output_cost: outputCost }; + } + const totalCost = + (inputTokens * modelCost.input_cost + + outputTokens * modelCost.output_cost) / + 1_000_000; + + return totalCost; +} + +export type GenerateCompletionsOptions = { + model?: LanguageModel; logger: Logger; options: ExtractOptions; markdown?: string; previousWarning?: string; isExtractEndpoint?: boolean; mode?: "object" | "no-object"; -}): Promise<{ + providerOptions?: LanguageModelV1ProviderMetadata; + retryModel?: LanguageModel; +}; +export async function generateCompletions({ + logger, + options, + markdown, + previousWarning, + isExtractEndpoint, + model = getModel("gpt-4o-mini", "openai"), + mode = "object", + providerOptions, + retryModel = getModel("claude-3-5-sonnet-20240620", "anthropic"), +}: GenerateCompletionsOptions): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage; model: string; + cost: number; }> { let extract: any; let warning: string | undefined; + let currentModel = model; + let lastError: Error | null = null; if (markdown === undefined) { throw new Error("document.markdown is undefined -- this is unexpected"); } - const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId); + const { maxInputTokens, maxOutputTokens } = getModelLimits( + currentModel.modelId, + ); // Calculate 80% of max input tokens (for content) const maxTokensSafe = Math.floor(maxInputTokens * 0.8); // Use the new trimming function - const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit( - markdown, - maxTokensSafe, - model.modelId, - previousWarning - ); + const { + text: trimmedMarkdown, + numTokens, + warning: trimWarning, + } = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning); - markdown = trimmedMarkdown; - warning = trimWarning; + // WE USE BIG MODELS NOW + // markdown = trimmedMarkdown; + // warning = trimWarning; try { - const prompt = options.prompt !== undefined - ? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}` - : `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`; + const prompt = + options.prompt !== undefined + ? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}` + : `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`; if (mode === "no-object") { - const result = await generateText({ - model: model, - prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""), - temperature: options.temperature ?? 0, - system: options.systemPrompt, - }); + try { + const result = await generateText({ + model: currentModel, + prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""), + system: options.systemPrompt, + providerOptions: { + anthropic: { + thinking: { type: "enabled", budgetTokens: 12000 }, + }, + }, + }); - extract = result.text; - - return { - extract, - warning, - numTokens, - totalUsage: { - promptTokens: numTokens, - completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), - }, - model: model.modelId, - }; + extract = result.text; + + return { + extract, + warning, + numTokens, + totalUsage: { + promptTokens: numTokens, + completionTokens: result.usage?.completionTokens ?? 0, + totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + }, + model: currentModel.modelId, + cost: calculateCost( + currentModel.modelId, + numTokens, + result.usage?.completionTokens ?? 0, + ), + }; + } catch (error) { + lastError = error as Error; + if ( + error.message?.includes("Quota exceeded") || + error.message?.includes("You exceeded your current quota") || + error.message?.includes("rate limit") + ) { + logger.warn("Quota exceeded, retrying with fallback model", { + error: lastError.message, + }); + currentModel = retryModel; + try { + const result = await generateText({ + model: currentModel, + prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""), + system: options.systemPrompt, + providerOptions: { + anthropic: { + thinking: { type: "enabled", budgetTokens: 12000 }, + }, + }, + }); + + extract = result.text; + + return { + extract, + warning, + numTokens, + totalUsage: { + promptTokens: numTokens, + completionTokens: result.usage?.completionTokens ?? 0, + totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + }, + model: currentModel.modelId, + cost: calculateCost( + currentModel.modelId, + numTokens, + result.usage?.completionTokens ?? 0, + ), + }; + } catch (retryError) { + lastError = retryError as Error; + logger.error("Failed with fallback model", { + originalError: lastError.message, + model: currentModel.modelId, + }); + throw lastError; + } + } + throw lastError; + } } let schema = options.schema; @@ -276,32 +421,114 @@ export async function generateCompletions({ } catch (_) {} } - const { text: fixedText } = await generateText({ - model: model, - prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`, - system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON." - }); - return fixedText; - } + try { + const { text: fixedText } = await generateText({ + model: currentModel, + prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`, + system: + "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.", + providerOptions: { + anthropic: { + thinking: { type: "enabled", budgetTokens: 12000 }, + }, + }, + }); + return fixedText; + } catch (repairError) { + lastError = repairError as Error; + logger.error("Failed to repair JSON", { error: lastError.message }); + throw lastError; + } + }, }; const generateObjectConfig = { - model: model, + model: currentModel, prompt: prompt, - temperature: options.temperature ?? 0, + providerOptions: providerOptions || undefined, system: options.systemPrompt, - ...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }), - ...(!schema && { output: 'no-schema' as const }), + ...(schema && { + schema: schema instanceof z.ZodType ? schema : jsonSchema(schema), + }), + ...(!schema && { output: "no-schema" as const }), ...repairConfig, ...(!schema && { onError: (error: Error) => { + lastError = error; console.error(error); - } - }) + }, + }), } satisfies Parameters[0]; - const result = await generateObject(generateObjectConfig); - extract = result.object; + // const now = new Date().getTime(); + // await fs.writeFile( + // `logs/generateObjectConfig-${now}.json`, + // JSON.stringify(generateObjectConfig, null, 2), + // ); + + let result: { object: any; usage: TokenUsage } | undefined; + try { + result = await generateObject(generateObjectConfig); + } catch (error) { + lastError = error as Error; + if ( + error.message?.includes("Quota exceeded") || + error.message?.includes("You exceeded your current quota") || + error.message?.includes("rate limit") + ) { + logger.warn("Quota exceeded, retrying with fallback model", { + error: lastError.message, + }); + currentModel = retryModel; + try { + const retryConfig = { + ...generateObjectConfig, + model: currentModel, + }; + result = await generateObject(retryConfig); + } catch (retryError) { + lastError = retryError as Error; + logger.error("Failed with fallback model", { + originalError: lastError.message, + model: currentModel.modelId, + }); + throw lastError; + } + } else if (NoObjectGeneratedError.isInstance(error)) { + console.log("No object generated", error); + if ( + error.text && + error.text.startsWith("```json") && + error?.text.endsWith("```") + ) { + try { + extract = JSON.parse( + error.text.slice("```json".length, -"```".length).trim(), + ); + result = { + object: extract, + usage: { + promptTokens: error.usage?.promptTokens ?? 0, + completionTokens: error.usage?.completionTokens ?? 0, + totalTokens: error.usage?.totalTokens ?? 0, + }, + }; + } catch (parseError) { + lastError = parseError as Error; + logger.error("Failed to parse JSON from error text", { + error: lastError.message, + }); + throw lastError; + } + } else { + throw lastError; + } + } else { + throw lastError; + } + } + + extract = result?.object; // If the users actually wants the items object, they can specify it as 'required' in the schema // otherwise, we just return the items array @@ -326,13 +553,20 @@ export async function generateCompletions({ completionTokens, totalTokens: promptTokens + completionTokens, }, - model: model.modelId, + model: currentModel.modelId, + cost: calculateCost(currentModel.modelId, promptTokens, completionTokens), }; } catch (error) { - if (error.message?.includes('refused')) { + lastError = error as Error; + if (error.message?.includes("refused")) { throw new LLMRefusalError(error.message); } - throw error; + logger.error("LLM extraction failed", { + error: lastError.message, + model: currentModel.modelId, + mode, + }); + throw lastError; } } @@ -341,22 +575,139 @@ export async function performLLMExtract( document: Document, ): Promise { if (meta.options.formats.includes("extract")) { - meta.internalOptions.abort?.throwIfAborted(); - const { extract, warning } = await generateCompletions({ + // const originalOptions = meta.options.extract!; + + // let generationOptions = { ...originalOptions }; // Start with original options + + const generationOptions: GenerateCompletionsOptions = { logger: meta.logger.child({ method: "performLLMExtract/generateCompletions", }), options: meta.options.extract!, markdown: document.markdown, - previousWarning: document.warning - }); + previousWarning: document.warning, + // ... existing model and provider options ... + // model: getModel("o3-mini", "openai"), // Keeping existing model selection + // model: getModel("o3-mini", "openai"), + // model: getModel("qwen-qwq-32b", "groq"), + // model: getModel("gemini-2.0-flash", "google"), + // model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + }; - if (meta.options.formats.includes("json")) { - document.json = extract; - } else { - document.extract = extract; + const { extractedDataArray, warning, smartScrapeCost, otherCost, costLimitExceededTokenUsage } = + await extractData({ + extractOptions: generationOptions, + urls: [meta.url], + useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model), + }); + + if (warning) { + document.warning = warning + (document.warning ? " " + document.warning : ""); } - document.warning = warning; + + if (document.metadata.costTracking) { + document.metadata.costTracking.smartScrapeCallCount++; + document.metadata.costTracking.smartScrapeCost += smartScrapeCost; + document.metadata.costTracking.otherCallCount++; + document.metadata.costTracking.otherCost += otherCost; + document.metadata.costTracking.totalCost += smartScrapeCost + otherCost; + if (costLimitExceededTokenUsage) { + document.metadata.costTracking.costLimitExceededTokenUsage = costLimitExceededTokenUsage; + } + } else { + document.metadata.costTracking = { + smartScrapeCallCount: 1, + smartScrapeCost: smartScrapeCost, + otherCallCount: 1, + otherCost: otherCost, + totalCost: smartScrapeCost + otherCost, + }; + } + + // IMPORTANT: here it only get's the last page!!! + const extractedData = + extractedDataArray[extractedDataArray.length - 1] ?? undefined; + + // // Prepare the schema, potentially wrapping it + // const { schemaToUse, schemaWasWrapped } = prepareSmartScrapeSchema( + // originalOptions.schema, + // meta.logger, + // ); + + // // Update generationOptions with the potentially wrapped schema + // generationOptions.schema = schemaToUse; + + // meta.internalOptions.abort?.throwIfAborted(); + // const { + // extract: rawExtract, + // warning, + // totalUsage, + // model, + // } = await generateCompletions({ + // logger: meta.logger.child({ + // method: "performLLMExtract/generateCompletions", + // }), + // options: generationOptions, // Use the potentially modified options + // markdown: document.markdown, + // previousWarning: document.warning, + // // ... existing model and provider options ... + // model: getModel("o3-mini", "openai"), // Keeping existing model selection + // providerOptions: { + // anthropic: { + // thinking: { type: "enabled", budgetTokens: 12000 }, + // }, + // }, + // }); + + // // Log token usage + // meta.logger.info("LLM extraction token usage", { + // model: model, + // promptTokens: totalUsage.promptTokens, + // completionTokens: totalUsage.completionTokens, + // totalTokens: totalUsage.totalTokens, + // }); + + // // Process the result to extract data and SmartScrape decision + // const { + // extractedData, + // shouldUseSmartscrape, + // smartscrape_reasoning, + // smartscrape_prompt, + // } = processSmartScrapeResult(rawExtract, schemaWasWrapped, meta.logger); + + // // Log the SmartScrape decision if applicable + // if (schemaWasWrapped) { + // meta.logger.info("SmartScrape decision processing result", { + // shouldUseSmartscrape, + // smartscrape_reasoning, + // // Don't log the full prompt potentially + // smartscrape_prompt_present: !!smartscrape_prompt, + // extractedDataIsPresent: + // extractedData !== undefined && extractedData !== null, + // }); + + // // TODO: Implement logic to ACTUALLY trigger SmartScrape based on the result + // // For example: + // // if (shouldUseSmartscrape && smartscrape_prompt) { + // // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt }); + // // // Call the smartScrape function (which needs to be implemented/imported) + // // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt); + // // // Process/merge smartScrapedDocs with extractedData + // // // ... potentially update finalExtract ... + // // } else { + // // meta.logger.info("SmartScrape not required based on LLM output."); + // // } + // } + + // Assign the final extracted data + if (meta.options.formats.includes("json")) { + document.json = extractedData; + } else { + document.extract = extractedData; + } + // document.warning = warning; } return document; @@ -366,7 +717,7 @@ export function removeDefaultProperty(schema: any): any { if (typeof schema !== "object" || schema === null) return schema; const rest = { ...schema }; - + // unsupported global keys delete rest.default; @@ -408,18 +759,22 @@ export function removeDefaultProperty(schema: any): any { return rest; } -export async function generateSchemaFromPrompt(prompt: string): Promise { - const model = getModel("gpt-4o"); +export async function generateSchemaFromPrompt( + prompt: string, +): Promise<{ extract: any; cost: number }> { + const model = getModel("gpt-4o", "openai"); + const retryModel = getModel("gpt-4o-mini", "openai"); const temperatures = [0, 0.1, 0.3]; // Different temperatures to try let lastError: Error | null = null; for (const temp of temperatures) { try { - const { extract } = await generateCompletions({ + const { extract, cost } = await generateCompletions({ logger: logger.child({ method: "generateSchemaFromPrompt/generateCompletions", }), - model: model, + model, + retryModel, options: { mode: "llm", systemPrompt: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt. @@ -448,13 +803,12 @@ DO NOT USE FORMATS. Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties. Return a valid JSON schema object with properties that would capture the information requested in the prompt.`, prompt: `Generate a JSON schema for extracting the following information: ${prompt}`, - temperature: temp + // temperature: temp, }, - markdown: prompt + markdown: prompt, }); - return extract; - + return { extract, cost }; } catch (error) { lastError = error as Error; logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`); diff --git a/apps/api/src/services/agentLivecastWS.ts b/apps/api/src/services/agentLivecastWS.ts new file mode 100644 index 00000000..b6d64c7b --- /dev/null +++ b/apps/api/src/services/agentLivecastWS.ts @@ -0,0 +1,56 @@ +import { configDotenv } from 'dotenv'; +import { logger } from '../lib/logger'; +import type { Request } from 'express'; +import WSWebSocket from 'ws'; +configDotenv(); + +/** + * Attaches WebSocket proxying logic to the Express application + * This function should be called after creating the Express app but before starting the server + */ +export function attachWsProxy(app: any) { + logger.info('Attaching WebSocket proxy to Express app'); + + // Make sure express-ws is properly initialized + if (!app.ws) { + logger.error('Express app does not have WebSocket support. Make sure express-ws is properly initialized.'); + return; + } + + // Define the WebSocket route + app.ws('/agent-livecast', (clientWs: WSWebSocket, req: Request) => { + try { + console.log(req.url); + const url = new URL(req.url ?? '', 'http://placeholder/'); + const sessionIdParam = url.searchParams.get('userProvidedId') || ''; + + const workerWsUrl = `${process.env.FIRE_ENGINE_BETA_URL?.replace('http', 'ws')}?userProvidedId=${sessionIdParam}`; + console.log(workerWsUrl) + const wsWorker = new WebSocket(workerWsUrl); + + wsWorker.onopen = () => { + // clientWs is your user's browser socket + // wsWorker is the worker's socket + + // Forward messages from the user -> worker + clientWs.on('message', (dataFromClient) => { + wsWorker.send(dataFromClient as unknown as string); + }); + + // Forward messages from the worker -> user + wsWorker.onmessage = (event) => { + clientWs.send(event.data); + }; + + // Close events + clientWs.on('close', () => wsWorker.close()); + wsWorker.onclose = () => clientWs.close(); + }; + } catch (error) { + console.error('Error in wsProxy upgrade:', error); + clientWs.close(); + } + }); + + logger.info('WebSocket proxy successfully attached to Express app'); +} diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index ce6a4887..7149709b 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -102,6 +102,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { crawl_id: job.crawl_id, tokens_billed: job.tokens_billed, is_migrated: true, + cost_tracking: job.cost_tracking, }; // Send job to external server @@ -181,6 +182,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { num_tokens: job.num_tokens, retry: job.retry, tokens_billed: job.tokens_billed, + cost_tracking: job.cost_tracking, }, }; if (job.mode !== "single_urls") { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 57ae8eb8..3f096448 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -61,7 +61,10 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; -import { performExtraction } from "../lib/extract/extraction-service"; +import { + ExtractResult, + performExtraction, +} from "../lib/extract/extraction-service"; import { supabase_service } from "../services/supabase"; import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url"; import { saveExtract, updateExtract } from "../lib/extract/extract-redis"; @@ -71,6 +74,7 @@ import { updateDeepResearch } from "../lib/deep-research/deep-research-redis"; import { performDeepResearch } from "../lib/deep-research/deep-research-service"; import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service"; import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis"; +import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0"; configDotenv(); @@ -100,19 +104,35 @@ const runningJobs: Set = new Set(); async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawlPre(job.data.crawl_id)) { - if (job.data.crawlerOptions && !await redisConnection.exists("crawl:" + job.data.crawl_id + ":invisible_urls")) { - await redisConnection.set("crawl:" + job.data.crawl_id + ":invisible_urls", "done", "EX", 60 * 60 * 24); + if ( + job.data.crawlerOptions && + !(await redisConnection.exists( + "crawl:" + job.data.crawl_id + ":invisible_urls", + )) + ) { + await redisConnection.set( + "crawl:" + job.data.crawl_id + ":invisible_urls", + "done", + "EX", + 60 * 60 * 24, + ); const sc = (await getCrawl(job.data.crawl_id))!; - const visitedUrls = new Set(await redisConnection.smembers( - "crawl:" + job.data.crawl_id + ":visited_unique", - )); + const visitedUrls = new Set( + await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited_unique", + ), + ); - const lastUrls: string[] = ((await supabase_service.rpc("diff_get_last_crawl_urls", { - i_team_id: job.data.team_id, - i_url: sc.originUrl!, - })).data ?? []).map(x => x.url); + const lastUrls: string[] = ( + ( + await supabase_service.rpc("diff_get_last_crawl_urls", { + i_team_id: job.data.team_id, + i_url: sc.originUrl!, + }) + ).data ?? [] + ).map((x) => x.url); const lastUrlsSet = new Set(lastUrls); @@ -124,14 +144,24 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { ); const univistedUrls = crawler.filterLinks( - Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)), + Array.from(lastUrlsSet).filter((x) => !visitedUrls.has(x)), Infinity, sc.crawlerOptions.maxDepth ?? 10, ); - - const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); - console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount); + const addableJobCount = + sc.crawlerOptions.limit === undefined + ? Infinity + : sc.crawlerOptions.limit - + (await getDoneJobsOrderedLength(job.data.crawl_id)); + + console.log( + sc.originUrl!, + univistedUrls, + visitedUrls, + lastUrls, + addableJobCount, + ); if (univistedUrls.length !== 0 && addableJobCount > 0) { const jobs = univistedUrls.slice(0, addableJobCount).map((url) => { @@ -401,13 +431,29 @@ const processExtractJobInternal = async ( }, jobLockExtendInterval); try { - const result = await performExtraction(job.data.extractId, { + let result: ExtractResult | null = null; + + // const model = job.data.request.agent?.model + // if (job.data.request.agent && model && model.toLowerCase().includes("fire-1")) { + // result = await performExtraction(job.data.extractId, { + // request: job.data.request, + // teamId: job.data.teamId, + // subId: job.data.subId, + // }); + // } else { + // result = await performExtraction_F0(job.data.extractId, { + // request: job.data.request, + // teamId: job.data.teamId, + // subId: job.data.subId, + // }); + // } + result = await performExtraction_F0(job.data.extractId, { request: job.data.request, teamId: job.data.teamId, subId: job.data.subId, }); - if (result.success) { + if (result && result.success) { // Move job to completed state in Redis await job.moveToCompleted(result, token, false); return result; @@ -418,7 +464,7 @@ const processExtractJobInternal = async ( await updateExtract(job.data.extractId, { status: "failed", error: - result.error ?? + result?.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId, }); @@ -481,7 +527,10 @@ const processDeepResearchJobInternal = async ( }, jobLockExtendInterval); try { - console.log("[Deep Research] Starting deep research: ", job.data.researchId); + console.log( + "[Deep Research] Starting deep research: ", + job.data.researchId, + ); const result = await performDeepResearch({ researchId: job.data.researchId, teamId: job.data.teamId, @@ -494,9 +543,9 @@ const processDeepResearchJobInternal = async ( systemPrompt: job.data.request.systemPrompt, formats: job.data.request.formats, jsonOptions: job.data.request.jsonOptions, - }); - - if(result.success) { + }); + + if (result.success) { // Move job to completed state in Redis and update research status await job.moveToCompleted(result, token, false); return result; @@ -544,7 +593,7 @@ const processGenerateLlmsTxtJobInternal = async ( ) => { const logger = _logger.child({ module: "generate-llmstxt-worker", - method: "processJobInternal", + method: "processJobInternal", jobId: job.id, generateId: job.data.generateId, teamId: job.data?.teamId ?? undefined, @@ -574,7 +623,9 @@ const processGenerateLlmsTxtJobInternal = async ( }); return result; } else { - const error = new Error("LLMs text generation failed without specific error"); + const error = new Error( + "LLMs text generation failed without specific error", + ); await job.moveToFailed(error, token, false); await updateGeneratedLlmsTxt(job.data.generateId, { status: "failed", @@ -598,7 +649,7 @@ const processGenerateLlmsTxtJobInternal = async ( } await updateGeneratedLlmsTxt(job.data.generateId, { - status: "failed", + status: "failed", error: error.message || "Unknown error occurred", }); @@ -685,7 +736,11 @@ const workerFun = async ( // we are 1 under the limit, assuming the job insertion logic never over-inserts. - MG const nextJob = await takeConcurrencyLimitedJob(job.data.team_id); if (nextJob !== null) { - await pushConcurrencyLimitActiveJob(job.data.team_id, nextJob.id, 60 * 1000); // 60s initial timeout + await pushConcurrencyLimitActiveJob( + job.data.team_id, + nextJob.id, + 60 * 1000, + ); // 60s initial timeout await queue.add( nextJob.id, @@ -1002,7 +1057,9 @@ async function processJob(job: Job & { id: string }, token: string) { } if (job.data.concurrencyLimited) { - doc.warning = "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." + (doc.warning ? " " + doc.warning : ""); + doc.warning = + "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." + + (doc.warning ? " " + doc.warning : ""); } const data = { @@ -1061,7 +1118,9 @@ async function processJob(job: Job & { id: string }, token: string) { // If this would be done for non-crossdomain redirects, but also for e.g. // redirecting / -> /introduction (like our docs site does), it would // break crawling the entire site without allowBackwardsCrawling - mogery - const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL); + const isHostnameDifferent = + normalizeUrlOnlyHostname(doc.metadata.url) !== + normalizeUrlOnlyHostname(doc.metadata.sourceURL); if (job.data.isCrawlSourceScrape && isHostnameDifferent) { // TODO: re-fetch sitemap for redirect target domain sc.originUrl = doc.metadata.url; @@ -1172,7 +1231,8 @@ async function processJob(job: Job & { id: string }, token: string) { internalOptions: sc.internalOptions, crawlerOptions: { ...sc.crawlerOptions, - currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1, + currentDiscoveryDepth: + (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1, }, origin: job.data.origin, crawl_id: job.data.crawl_id, @@ -1199,14 +1259,27 @@ async function processJob(job: Job & { id: string }, token: string) { } // Only run check after adding new jobs for discovery - mogery - if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) { - throw new Error("Source URL is not allowed by includePaths/excludePaths rules") + if ( + job.data.isCrawlSourceScrape && + crawler.filterLinks( + [doc.metadata.url ?? doc.metadata.sourceURL!], + 1, + sc.crawlerOptions?.maxDepth ?? 10, + ).length === 0 + ) { + throw new Error( + "Source URL is not allowed by includePaths/excludePaths rules", + ); } } } await finishCrawlIfNeeded(job, sc); } else { + const cost_tracking = doc?.metadata?.costTracking; + + delete doc.metadata.costTracking; + await logJob({ job_id: job.id, success: true, @@ -1220,6 +1293,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, num_tokens: 0, // TODO: fix + cost_tracking, }); indexJob(job, doc); @@ -1230,16 +1304,25 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.scrapeOptions.extract) { creditsToBeBilled = 5; } + if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1") { + creditsToBeBilled = 150; + } - if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! && process.env.USE_DB_AUTHENTICATION === "true") { + if ( + job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! && + process.env.USE_DB_AUTHENTICATION === "true" + ) { try { const billingJobId = uuidv4(); - logger.debug(`Adding billing job to queue for team ${job.data.team_id}`, { - billingJobId, - credits: creditsToBeBilled, - is_extract: false, - }); - + logger.debug( + `Adding billing job to queue for team ${job.data.team_id}`, + { + billingJobId, + credits: creditsToBeBilled, + is_extract: false, + }, + ); + // Add directly to the billing queue - the billing worker will handle the rest await getBillingQueue().add( "bill_team", @@ -1249,12 +1332,12 @@ async function processJob(job: Job & { id: string }, token: string) { credits: creditsToBeBilled, is_extract: false, timestamp: new Date().toISOString(), - originating_job_id: job.id + originating_job_id: job.id, }, { jobId: billingJobId, priority: 10, - } + }, ); } catch (error) { logger.error( diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 600b42a6..2a502c3b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -22,35 +22,6 @@ export const testSuiteRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); -// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery -const testSuiteTokens = [ - "a01ccae", - "6254cf9", - "0f96e673", - "23befa1b", - "69141c4", - "48f9a97", - "5dc70ad", - "e5e60e5", - "65181ba", - "77c85b7", - "8567275", - "6c46abb", - "cb0ff78", - "fd769b2", - // "4c2638d", - "cbb3462", // don't remove (s-ai) - "824abcd", // don't remove (s-ai) - "0966288", - "226556f", - "0a18c9e", // gh -]; - -// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery -// const manual_growth = ["22a07b64-cbfe-4924-9273-e3f01709cdf2"]; -// const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"]; -// const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e", "778c62c4-306f-4039-b372-eb20174760c0"]; - const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = { crawl: 15, scrape: 100, @@ -60,6 +31,8 @@ const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = { preview: 25, extractStatus: 25000, crawlStatus: 25000, + extractAgentPreview: 1, + scrapeAgentPreview: 5, }; export function getRateLimiter( @@ -68,10 +41,6 @@ export function getRateLimiter( ): RateLimiterRedis { return createRateLimiter( `${mode}`, - (rate_limits ?? fallbackRateLimits)[mode] ?? 500, + (rate_limits?.[mode] ?? fallbackRateLimits?.[mode] ?? 500), ); } - -export function isTestSuiteToken(token: string): boolean { - return testSuiteTokens.some((testToken) => token.includes(testToken)); -} diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 6a6ae6d9..65e3a428 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -7,6 +7,7 @@ import { } from "./controllers/v1/types"; import { ExtractorOptions, Document } from "./lib/entities"; import { InternalOptions } from "./scraper/scrapeURL"; +import type { CostTracking } from "./lib/extract/extraction-service"; type Mode = "crawl" | "single_urls" | "sitemap"; @@ -90,6 +91,7 @@ export interface FirecrawlJob { crawl_id?: string; tokens_billed?: number; sources?: Record; + cost_tracking?: CostTracking; } export interface FirecrawlScrapeResponse { @@ -132,11 +134,13 @@ export enum RateLimiterMode { Crawl = "crawl", CrawlStatus = "crawlStatus", Scrape = "scrape", + ScrapeAgentPreview = "scrapeAgentPreview", Preview = "preview", Search = "search", Map = "map", Extract = "extract", ExtractStatus = "extractStatus", + ExtractAgentPreview = "extractAgentPreview", } export type AuthResponse = diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json index ab2a9546..e220e213 100644 --- a/apps/api/tsconfig.json +++ b/apps/api/tsconfig.json @@ -3,23 +3,26 @@ "rootDir": "./src", "lib": ["ES2022", "DOM"], - // or higher "target": "ES2022", - "module": "commonjs", + "module": "NodeNext", "esModuleInterop": true, "sourceMap": true, "outDir": "./dist/src", - "moduleResolution": "node", + "moduleResolution": "NodeNext", "baseUrl": ".", "strictNullChecks": true, - "paths": { - "*": ["node_modules/*", "src/types/*"], - }, - - "inlineSources": true, + "inlineSources": true }, - "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] + "include": [ + "src/", + "src/**/*", + "services/db/supabase.ts", + "utils/utils.ts", + "services/db/supabaseEmbeddings.ts", + "utils/EventEmmitter.ts", + "src/services/queue-service.ts" + ] } diff --git a/apps/js-sdk/firecrawl/pnpm-lock.yaml b/apps/js-sdk/firecrawl/pnpm-lock.yaml new file mode 100644 index 00000000..c8e1e564 --- /dev/null +++ b/apps/js-sdk/firecrawl/pnpm-lock.yaml @@ -0,0 +1,3611 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + axios: + specifier: ^1.6.8 + version: 1.7.9 + isows: + specifier: ^1.0.4 + version: 1.0.6(ws@8.18.0) + typescript-event-target: + specifier: ^1.1.1 + version: 1.1.1 + zod: + specifier: ^3.23.8 + version: 3.24.1 + zod-to-json-schema: + specifier: ^3.23.0 + version: 3.24.1(zod@3.24.1) + devDependencies: + '@jest/globals': + specifier: ^29.7.0 + version: 29.7.0 + '@types/axios': + specifier: ^0.14.0 + version: 0.14.4 + '@types/dotenv': + specifier: ^8.2.0 + version: 8.2.3 + '@types/jest': + specifier: ^29.5.12 + version: 29.5.14 + '@types/mocha': + specifier: ^10.0.6 + version: 10.0.10 + '@types/node': + specifier: ^20.12.12 + version: 20.17.10 + '@types/uuid': + specifier: ^9.0.8 + version: 9.0.8 + dotenv: + specifier: ^16.4.5 + version: 16.4.7 + jest: + specifier: ^29.7.0 + version: 29.7.0(@types/node@20.17.10) + ts-jest: + specifier: ^29.2.2 + version: 29.2.5(@babel/core@7.26.0)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.26.0))(esbuild@0.24.0)(jest@29.7.0(@types/node@20.17.10))(typescript@5.7.2) + tsup: + specifier: ^8.2.4 + version: 8.3.5(typescript@5.7.2) + typescript: + specifier: ^5.4.5 + version: 5.7.2 + uuid: + specifier: ^9.0.1 + version: 9.0.1 + +packages: + + '@ampproject/remapping@2.3.0': + resolution: {integrity: sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==} + engines: {node: '>=6.0.0'} + + '@babel/code-frame@7.26.2': + resolution: {integrity: sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==} + engines: {node: '>=6.9.0'} + + '@babel/compat-data@7.26.3': + resolution: {integrity: sha512-nHIxvKPniQXpmQLb0vhY3VaFb3S0YrTAwpOWJZh1wn3oJPjJk9Asva204PsBdmAE8vpzfHudT8DB0scYvy9q0g==} + engines: {node: '>=6.9.0'} + + '@babel/core@7.26.0': + resolution: {integrity: sha512-i1SLeK+DzNnQ3LL/CswPCa/E5u4lh1k6IAEphON8F+cXt0t9euTshDru0q7/IqMa1PMPz5RnHuHscF8/ZJsStg==} + engines: {node: '>=6.9.0'} + + '@babel/generator@7.26.3': + resolution: {integrity: sha512-6FF/urZvD0sTeO7k6/B15pMLC4CHUv1426lzr3N01aHJTl046uCAh9LXW/fzeXXjPNCJ6iABW5XaWOsIZB93aQ==} + engines: {node: '>=6.9.0'} + + '@babel/helper-compilation-targets@7.25.9': + resolution: {integrity: sha512-j9Db8Suy6yV/VHa4qzrj9yZfZxhLWQdVnRlXxmKLYlhWUVB1sB2G5sxuWYXk/whHD9iW76PmNzxZ4UCnTQTVEQ==} + engines: {node: '>=6.9.0'} + + '@babel/helper-module-imports@7.25.9': + resolution: {integrity: sha512-tnUA4RsrmflIM6W6RFTLFSXITtl0wKjgpnLgXyowocVPrbYrLUXSBXDgTs8BlbmIzIdlBySRQjINYs2BAkiLtw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-module-transforms@7.26.0': + resolution: {integrity: sha512-xO+xu6B5K2czEnQye6BHA7DolFFmS3LB7stHZFaOLb1pAwO1HWLS8fXA+eh0A2yIvltPVmx3eNNDBJA2SLHXFw==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0 + + '@babel/helper-plugin-utils@7.25.9': + resolution: {integrity: sha512-kSMlyUVdWe25rEsRGviIgOWnoT/nfABVWlqt9N19/dIPWViAOW2s9wznP5tURbs/IDuNk4gPy3YdYRgH3uxhBw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-string-parser@7.25.9': + resolution: {integrity: sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==} + engines: {node: '>=6.9.0'} + + '@babel/helper-validator-identifier@7.25.9': + resolution: {integrity: sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==} + engines: {node: '>=6.9.0'} + + '@babel/helper-validator-option@7.25.9': + resolution: {integrity: sha512-e/zv1co8pp55dNdEcCynfj9X7nyUKUXoUEwfXqaZt0omVOmDe9oOTdKStH4GmAw6zxMFs50ZayuMfHDKlO7Tfw==} + engines: {node: '>=6.9.0'} + + '@babel/helpers@7.26.0': + resolution: {integrity: sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==} + engines: {node: '>=6.9.0'} + + '@babel/parser@7.26.3': + resolution: {integrity: sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==} + engines: {node: '>=6.0.0'} + hasBin: true + + '@babel/plugin-syntax-async-generators@7.8.4': + resolution: {integrity: sha512-tycmZxkGfZaxhMRbXlPXuVFpdWlXpir2W4AMhSJgRKzk/eDlIXOhb2LHWoLpDF7TEHylV5zNhykX6KAgHJmTNw==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-bigint@7.8.3': + resolution: {integrity: sha512-wnTnFlG+YxQm3vDxpGE57Pj0srRU4sHE/mDkt1qv2YJJSeUAec2ma4WLUnUPeKjyrfntVwe/N6dCXpU+zL3Npg==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-class-properties@7.12.13': + resolution: {integrity: sha512-fm4idjKla0YahUNgFNLCB0qySdsoPiZP3iQE3rky0mBUtMZ23yDJ9SJdg6dXTSDnulOVqiF3Hgr9nbXvXTQZYA==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-class-static-block@7.14.5': + resolution: {integrity: sha512-b+YyPmr6ldyNnM6sqYeMWE+bgJcJpO6yS4QD7ymxgH34GBPNDM/THBh8iunyvKIZztiwLH4CJZ0RxTk9emgpjw==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-import-attributes@7.26.0': + resolution: {integrity: sha512-e2dttdsJ1ZTpi3B9UYGLw41hifAubg19AtCu/2I/F1QNVclOBr1dYpTdmdyZ84Xiz43BS/tCUkMAZNLv12Pi+A==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-import-meta@7.10.4': + resolution: {integrity: sha512-Yqfm+XDx0+Prh3VSeEQCPU81yC+JWZ2pDPFSS4ZdpfZhp4MkFMaDC1UqseovEKwSUpnIL7+vK+Clp7bfh0iD7g==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-json-strings@7.8.3': + resolution: {integrity: sha512-lY6kdGpWHvjoe2vk4WrAapEuBR69EMxZl+RoGRhrFGNYVK8mOPAW8VfbT/ZgrFbXlDNiiaxQnAtgVCZ6jv30EA==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-jsx@7.25.9': + resolution: {integrity: sha512-ld6oezHQMZsZfp6pWtbjaNDF2tiiCYYDqQszHt5VV437lewP9aSi2Of99CK0D0XB21k7FLgnLcmQKyKzynfeAA==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-logical-assignment-operators@7.10.4': + resolution: {integrity: sha512-d8waShlpFDinQ5MtvGU9xDAOzKH47+FFoney2baFIoMr952hKOLp1HR7VszoZvOsV/4+RRszNY7D17ba0te0ig==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-nullish-coalescing-operator@7.8.3': + resolution: {integrity: sha512-aSff4zPII1u2QD7y+F8oDsz19ew4IGEJg9SVW+bqwpwtfFleiQDMdzA/R+UlWDzfnHFCxxleFT0PMIrR36XLNQ==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-numeric-separator@7.10.4': + resolution: {integrity: sha512-9H6YdfkcK/uOnY/K7/aA2xpzaAgkQn37yzWUMRK7OaPOqOpGS1+n0H5hxT9AUw9EsSjPW8SVyMJwYRtWs3X3ug==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-object-rest-spread@7.8.3': + resolution: {integrity: sha512-XoqMijGZb9y3y2XskN+P1wUGiVwWZ5JmoDRwx5+3GmEplNyVM2s2Dg8ILFQm8rWM48orGy5YpI5Bl8U1y7ydlA==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-optional-catch-binding@7.8.3': + resolution: {integrity: sha512-6VPD0Pc1lpTqw0aKoeRTMiB+kWhAoT24PA+ksWSBrFtl5SIRVpZlwN3NNPQjehA2E/91FV3RjLWoVTglWcSV3Q==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-optional-chaining@7.8.3': + resolution: {integrity: sha512-KoK9ErH1MBlCPxV0VANkXW2/dw4vlbGDrFgz8bmUsBGYkFRcbRwMh6cIJubdPrkxRwuGdtCk0v/wPTKbQgBjkg==} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-private-property-in-object@7.14.5': + resolution: {integrity: sha512-0wVnp9dxJ72ZUJDV27ZfbSj6iHLoytYZmh3rFcxNnvsJF3ktkzLDZPy/mA17HGsaQT3/DQsWYX1f1QGWkCoVUg==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-top-level-await@7.14.5': + resolution: {integrity: sha512-hx++upLv5U1rgYfwe1xBQUhRmU41NEvpUvrp8jkrSCdvGSnM5/qdRMtylJ6PG5OFkBaHkbTAKTnd3/YyESRHFw==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-typescript@7.25.9': + resolution: {integrity: sha512-hjMgRy5hb8uJJjUcdWunWVcoi9bGpJp8p5Ol1229PoN6aytsLwNMgmdftO23wnCLMfVmTwZDWMPNq/D1SY60JQ==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/template@7.25.9': + resolution: {integrity: sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==} + engines: {node: '>=6.9.0'} + + '@babel/traverse@7.26.4': + resolution: {integrity: sha512-fH+b7Y4p3yqvApJALCPJcwb0/XaOSgtK4pzV6WVjPR5GLFQBRI7pfoX2V2iM48NXvX07NUxxm1Vw98YjqTcU5w==} + engines: {node: '>=6.9.0'} + + '@babel/types@7.26.3': + resolution: {integrity: sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==} + engines: {node: '>=6.9.0'} + + '@bcoe/v8-coverage@0.2.3': + resolution: {integrity: sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==} + + '@esbuild/aix-ppc64@0.24.0': + resolution: {integrity: sha512-WtKdFM7ls47zkKHFVzMz8opM7LkcsIp9amDUBIAWirg70RM71WRSjdILPsY5Uv1D42ZpUfaPILDlfactHgsRkw==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + + '@esbuild/android-arm64@0.24.0': + resolution: {integrity: sha512-Vsm497xFM7tTIPYK9bNTYJyF/lsP590Qc1WxJdlB6ljCbdZKU9SY8i7+Iin4kyhV/KV5J2rOKsBQbB77Ab7L/w==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + + '@esbuild/android-arm@0.24.0': + resolution: {integrity: sha512-arAtTPo76fJ/ICkXWetLCc9EwEHKaeya4vMrReVlEIUCAUncH7M4bhMQ+M9Vf+FFOZJdTNMXNBrWwW+OXWpSew==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + + '@esbuild/android-x64@0.24.0': + resolution: {integrity: sha512-t8GrvnFkiIY7pa7mMgJd7p8p8qqYIz1NYiAoKc75Zyv73L3DZW++oYMSHPRarcotTKuSs6m3hTOa5CKHaS02TQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + + '@esbuild/darwin-arm64@0.24.0': + resolution: {integrity: sha512-CKyDpRbK1hXwv79soeTJNHb5EiG6ct3efd/FTPdzOWdbZZfGhpbcqIpiD0+vwmpu0wTIL97ZRPZu8vUt46nBSw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + + '@esbuild/darwin-x64@0.24.0': + resolution: {integrity: sha512-rgtz6flkVkh58od4PwTRqxbKH9cOjaXCMZgWD905JOzjFKW+7EiUObfd/Kav+A6Gyud6WZk9w+xu6QLytdi2OA==} + engines: {node: '>=18'} + cpu: [x64] + os: [darwin] + + '@esbuild/freebsd-arm64@0.24.0': + resolution: {integrity: sha512-6Mtdq5nHggwfDNLAHkPlyLBpE5L6hwsuXZX8XNmHno9JuL2+bg2BX5tRkwjyfn6sKbxZTq68suOjgWqCicvPXA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + + '@esbuild/freebsd-x64@0.24.0': + resolution: {integrity: sha512-D3H+xh3/zphoX8ck4S2RxKR6gHlHDXXzOf6f/9dbFt/NRBDIE33+cVa49Kil4WUjxMGW0ZIYBYtaGCa2+OsQwQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + + '@esbuild/linux-arm64@0.24.0': + resolution: {integrity: sha512-TDijPXTOeE3eaMkRYpcy3LarIg13dS9wWHRdwYRnzlwlA370rNdZqbcp0WTyyV/k2zSxfko52+C7jU5F9Tfj1g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + + '@esbuild/linux-arm@0.24.0': + resolution: {integrity: sha512-gJKIi2IjRo5G6Glxb8d3DzYXlxdEj2NlkixPsqePSZMhLudqPhtZ4BUrpIuTjJYXxvF9njql+vRjB2oaC9XpBw==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + + '@esbuild/linux-ia32@0.24.0': + resolution: {integrity: sha512-K40ip1LAcA0byL05TbCQ4yJ4swvnbzHscRmUilrmP9Am7//0UjPreh4lpYzvThT2Quw66MhjG//20mrufm40mA==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + + '@esbuild/linux-loong64@0.24.0': + resolution: {integrity: sha512-0mswrYP/9ai+CU0BzBfPMZ8RVm3RGAN/lmOMgW4aFUSOQBjA31UP8Mr6DDhWSuMwj7jaWOT0p0WoZ6jeHhrD7g==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + + '@esbuild/linux-mips64el@0.24.0': + resolution: {integrity: sha512-hIKvXm0/3w/5+RDtCJeXqMZGkI2s4oMUGj3/jM0QzhgIASWrGO5/RlzAzm5nNh/awHE0A19h/CvHQe6FaBNrRA==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + + '@esbuild/linux-ppc64@0.24.0': + resolution: {integrity: sha512-HcZh5BNq0aC52UoocJxaKORfFODWXZxtBaaZNuN3PUX3MoDsChsZqopzi5UupRhPHSEHotoiptqikjN/B77mYQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + + '@esbuild/linux-riscv64@0.24.0': + resolution: {integrity: sha512-bEh7dMn/h3QxeR2KTy1DUszQjUrIHPZKyO6aN1X4BCnhfYhuQqedHaa5MxSQA/06j3GpiIlFGSsy1c7Gf9padw==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + + '@esbuild/linux-s390x@0.24.0': + resolution: {integrity: sha512-ZcQ6+qRkw1UcZGPyrCiHHkmBaj9SiCD8Oqd556HldP+QlpUIe2Wgn3ehQGVoPOvZvtHm8HPx+bH20c9pvbkX3g==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + + '@esbuild/linux-x64@0.24.0': + resolution: {integrity: sha512-vbutsFqQ+foy3wSSbmjBXXIJ6PL3scghJoM8zCL142cGaZKAdCZHyf+Bpu/MmX9zT9Q0zFBVKb36Ma5Fzfa8xA==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + + '@esbuild/netbsd-x64@0.24.0': + resolution: {integrity: sha512-hjQ0R/ulkO8fCYFsG0FZoH+pWgTTDreqpqY7UnQntnaKv95uP5iW3+dChxnx7C3trQQU40S+OgWhUVwCjVFLvg==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + + '@esbuild/openbsd-arm64@0.24.0': + resolution: {integrity: sha512-MD9uzzkPQbYehwcN583yx3Tu5M8EIoTD+tUgKF982WYL9Pf5rKy9ltgD0eUgs8pvKnmizxjXZyLt0z6DC3rRXg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + + '@esbuild/openbsd-x64@0.24.0': + resolution: {integrity: sha512-4ir0aY1NGUhIC1hdoCzr1+5b43mw99uNwVzhIq1OY3QcEwPDO3B7WNXBzaKY5Nsf1+N11i1eOfFcq+D/gOS15Q==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + + '@esbuild/sunos-x64@0.24.0': + resolution: {integrity: sha512-jVzdzsbM5xrotH+W5f1s+JtUy1UWgjU0Cf4wMvffTB8m6wP5/kx0KiaLHlbJO+dMgtxKV8RQ/JvtlFcdZ1zCPA==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + + '@esbuild/win32-arm64@0.24.0': + resolution: {integrity: sha512-iKc8GAslzRpBytO2/aN3d2yb2z8XTVfNV0PjGlCxKo5SgWmNXx82I/Q3aG1tFfS+A2igVCY97TJ8tnYwpUWLCA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + + '@esbuild/win32-ia32@0.24.0': + resolution: {integrity: sha512-vQW36KZolfIudCcTnaTpmLQ24Ha1RjygBo39/aLkM2kmjkWmZGEJ5Gn9l5/7tzXA42QGIoWbICfg6KLLkIw6yw==} + engines: {node: '>=18'} + cpu: [ia32] + os: [win32] + + '@esbuild/win32-x64@0.24.0': + resolution: {integrity: sha512-7IAFPrjSQIJrGsK6flwg7NFmwBoSTyF3rl7If0hNUFQU4ilTsEPL6GuMuU9BfIWVVGuRnuIidkSMC+c0Otu8IA==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + + '@isaacs/cliui@8.0.2': + resolution: {integrity: sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==} + engines: {node: '>=12'} + + '@istanbuljs/load-nyc-config@1.1.0': + resolution: {integrity: sha512-VjeHSlIzpv/NyD3N0YuHfXOPDIixcA1q2ZV98wsMqcYlPmv2n3Yb2lYP9XMElnaFVXg5A7YLTeLu6V84uQDjmQ==} + engines: {node: '>=8'} + + '@istanbuljs/schema@0.1.3': + resolution: {integrity: sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==} + engines: {node: '>=8'} + + '@jest/console@29.7.0': + resolution: {integrity: sha512-5Ni4CU7XHQi32IJ398EEP4RrB8eV09sXP2ROqD4bksHrnTree52PsxvX8tpL8LvTZ3pFzXyPbNQReSN41CAhOg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/core@29.7.0': + resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + node-notifier: ^8.0.1 || ^9.0.0 || ^10.0.0 + peerDependenciesMeta: + node-notifier: + optional: true + + '@jest/environment@29.7.0': + resolution: {integrity: sha512-aQIfHDq33ExsN4jP1NWGXhxgQ/wixs60gDiKO+XVMd8Mn0NWPWgc34ZQDTb2jKaUWQ7MuwoitXAsN2XVXNMpAw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/expect-utils@29.7.0': + resolution: {integrity: sha512-GlsNBWiFQFCVi9QVSx7f5AgMeLxe9YCCs5PuP2O2LdjDAA8Jh9eX7lA1Jq/xdXw3Wb3hyvlFNfZIfcRetSzYcA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/expect@29.7.0': + resolution: {integrity: sha512-8uMeAMycttpva3P1lBHB8VciS9V0XAr3GymPpipdyQXbBcuhkLQOSe8E/p92RyAdToS6ZD1tFkX+CkhoECE0dQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/fake-timers@29.7.0': + resolution: {integrity: sha512-q4DH1Ha4TTFPdxLsqDXK1d3+ioSL7yL5oCMJZgDYm6i+6CygW5E5xVr/D1HdsGxjt1ZWSfUAs9OxSB/BNelWrQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/globals@29.7.0': + resolution: {integrity: sha512-mpiz3dutLbkW2MNFubUGUEVLkTGiqW6yLVTA+JbP6fI6J5iL9Y0Nlg8k95pcF8ctKwCS7WVxteBs29hhfAotzQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/reporters@29.7.0': + resolution: {integrity: sha512-DApq0KJbJOEzAFYjHADNNxAE3KbhxQB1y5Kplb5Waqw6zVbuWatSnMjE5gs8FUgEPmNsnZA3NCWl9NG0ia04Pg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + node-notifier: ^8.0.1 || ^9.0.0 || ^10.0.0 + peerDependenciesMeta: + node-notifier: + optional: true + + '@jest/schemas@29.6.3': + resolution: {integrity: sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/source-map@29.6.3': + resolution: {integrity: sha512-MHjT95QuipcPrpLM+8JMSzFx6eHp5Bm+4XeFDJlwsvVBjmKNiIAvasGK2fxz2WbGRlnvqehFbh07MMa7n3YJnw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/test-result@29.7.0': + resolution: {integrity: sha512-Fdx+tv6x1zlkJPcWXmMDAG2HBnaR9XPSd5aDWQVsfrZmLVT3lU1cwyxLgRmXR9yrq4NBoEm9BMsfgFzTQAbJYA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/test-sequencer@29.7.0': + resolution: {integrity: sha512-GQwJ5WZVrKnOJuiYiAF52UNUJXgTZx1NHjFSEB0qEMmSZKAkdMoIzw/Cj6x6NF4AvV23AUqDpFzQkN/eYCYTxw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/transform@29.7.0': + resolution: {integrity: sha512-ok/BTPFzFKVMwO5eOHRrvnBVHdRy9IrsrW1GpMaQ9MCnilNLXQKmAX8s1YXDFaai9xJpac2ySzV0YeRRECr2Vw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jest/types@29.6.3': + resolution: {integrity: sha512-u3UPsIilWKOM3F9CXtrG8LEJmNxwoCQC/XVj4IKYXvvpx7QIi/Kg1LI5uDmDpKlac62NUtX7eLjRh+jVZcLOzw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + '@jridgewell/gen-mapping@0.3.8': + resolution: {integrity: sha512-imAbBGkb+ebQyxKgzv5Hu2nmROxoDOXHh80evxdoXNOrvAnVx7zimzc1Oo5h9RlfV4vPXaE2iM5pOFbvOCClWA==} + engines: {node: '>=6.0.0'} + + '@jridgewell/resolve-uri@3.1.2': + resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} + engines: {node: '>=6.0.0'} + + '@jridgewell/set-array@1.2.1': + resolution: {integrity: sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==} + engines: {node: '>=6.0.0'} + + '@jridgewell/sourcemap-codec@1.5.0': + resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==} + + '@jridgewell/trace-mapping@0.3.25': + resolution: {integrity: sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==} + + '@pkgjs/parseargs@0.11.0': + resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} + engines: {node: '>=14'} + + '@rollup/rollup-android-arm-eabi@4.28.1': + resolution: {integrity: sha512-2aZp8AES04KI2dy3Ss6/MDjXbwBzj+i0GqKtWXgw2/Ma6E4jJvujryO6gJAghIRVz7Vwr9Gtl/8na3nDUKpraQ==} + cpu: [arm] + os: [android] + + '@rollup/rollup-android-arm64@4.28.1': + resolution: {integrity: sha512-EbkK285O+1YMrg57xVA+Dp0tDBRB93/BZKph9XhMjezf6F4TpYjaUSuPt5J0fZXlSag0LmZAsTmdGGqPp4pQFA==} + cpu: [arm64] + os: [android] + + '@rollup/rollup-darwin-arm64@4.28.1': + resolution: {integrity: sha512-prduvrMKU6NzMq6nxzQw445zXgaDBbMQvmKSJaxpaZ5R1QDM8w+eGxo6Y/jhT/cLoCvnZI42oEqf9KQNYz1fqQ==} + cpu: [arm64] + os: [darwin] + + '@rollup/rollup-darwin-x64@4.28.1': + resolution: {integrity: sha512-WsvbOunsUk0wccO/TV4o7IKgloJ942hVFK1CLatwv6TJspcCZb9umQkPdvB7FihmdxgaKR5JyxDjWpCOp4uZlQ==} + cpu: [x64] + os: [darwin] + + '@rollup/rollup-freebsd-arm64@4.28.1': + resolution: {integrity: sha512-HTDPdY1caUcU4qK23FeeGxCdJF64cKkqajU0iBnTVxS8F7H/7BewvYoG+va1KPSL63kQ1PGNyiwKOfReavzvNA==} + cpu: [arm64] + os: [freebsd] + + '@rollup/rollup-freebsd-x64@4.28.1': + resolution: {integrity: sha512-m/uYasxkUevcFTeRSM9TeLyPe2QDuqtjkeoTpP9SW0XxUWfcYrGDMkO/m2tTw+4NMAF9P2fU3Mw4ahNvo7QmsQ==} + cpu: [x64] + os: [freebsd] + + '@rollup/rollup-linux-arm-gnueabihf@4.28.1': + resolution: {integrity: sha512-QAg11ZIt6mcmzpNE6JZBpKfJaKkqTm1A9+y9O+frdZJEuhQxiugM05gnCWiANHj4RmbgeVJpTdmKRmH/a+0QbA==} + cpu: [arm] + os: [linux] + + '@rollup/rollup-linux-arm-musleabihf@4.28.1': + resolution: {integrity: sha512-dRP9PEBfolq1dmMcFqbEPSd9VlRuVWEGSmbxVEfiq2cs2jlZAl0YNxFzAQS2OrQmsLBLAATDMb3Z6MFv5vOcXg==} + cpu: [arm] + os: [linux] + + '@rollup/rollup-linux-arm64-gnu@4.28.1': + resolution: {integrity: sha512-uGr8khxO+CKT4XU8ZUH1TTEUtlktK6Kgtv0+6bIFSeiSlnGJHG1tSFSjm41uQ9sAO/5ULx9mWOz70jYLyv1QkA==} + cpu: [arm64] + os: [linux] + + '@rollup/rollup-linux-arm64-musl@4.28.1': + resolution: {integrity: sha512-QF54q8MYGAqMLrX2t7tNpi01nvq5RI59UBNx+3+37zoKX5KViPo/gk2QLhsuqok05sSCRluj0D00LzCwBikb0A==} + cpu: [arm64] + os: [linux] + + '@rollup/rollup-linux-loongarch64-gnu@4.28.1': + resolution: {integrity: sha512-vPul4uodvWvLhRco2w0GcyZcdyBfpfDRgNKU+p35AWEbJ/HPs1tOUrkSueVbBS0RQHAf/A+nNtDpvw95PeVKOA==} + cpu: [loong64] + os: [linux] + + '@rollup/rollup-linux-powerpc64le-gnu@4.28.1': + resolution: {integrity: sha512-pTnTdBuC2+pt1Rmm2SV7JWRqzhYpEILML4PKODqLz+C7Ou2apEV52h19CR7es+u04KlqplggmN9sqZlekg3R1A==} + cpu: [ppc64] + os: [linux] + + '@rollup/rollup-linux-riscv64-gnu@4.28.1': + resolution: {integrity: sha512-vWXy1Nfg7TPBSuAncfInmAI/WZDd5vOklyLJDdIRKABcZWojNDY0NJwruY2AcnCLnRJKSaBgf/GiJfauu8cQZA==} + cpu: [riscv64] + os: [linux] + + '@rollup/rollup-linux-s390x-gnu@4.28.1': + resolution: {integrity: sha512-/yqC2Y53oZjb0yz8PVuGOQQNOTwxcizudunl/tFs1aLvObTclTwZ0JhXF2XcPT/zuaymemCDSuuUPXJJyqeDOg==} + cpu: [s390x] + os: [linux] + + '@rollup/rollup-linux-x64-gnu@4.28.1': + resolution: {integrity: sha512-fzgeABz7rrAlKYB0y2kSEiURrI0691CSL0+KXwKwhxvj92VULEDQLpBYLHpF49MSiPG4sq5CK3qHMnb9tlCjBw==} + cpu: [x64] + os: [linux] + + '@rollup/rollup-linux-x64-musl@4.28.1': + resolution: {integrity: sha512-xQTDVzSGiMlSshpJCtudbWyRfLaNiVPXt1WgdWTwWz9n0U12cI2ZVtWe/Jgwyv/6wjL7b66uu61Vg0POWVfz4g==} + cpu: [x64] + os: [linux] + + '@rollup/rollup-win32-arm64-msvc@4.28.1': + resolution: {integrity: sha512-wSXmDRVupJstFP7elGMgv+2HqXelQhuNf+IS4V+nUpNVi/GUiBgDmfwD0UGN3pcAnWsgKG3I52wMOBnk1VHr/A==} + cpu: [arm64] + os: [win32] + + '@rollup/rollup-win32-ia32-msvc@4.28.1': + resolution: {integrity: sha512-ZkyTJ/9vkgrE/Rk9vhMXhf8l9D+eAhbAVbsGsXKy2ohmJaWg0LPQLnIxRdRp/bKyr8tXuPlXhIoGlEB5XpJnGA==} + cpu: [ia32] + os: [win32] + + '@rollup/rollup-win32-x64-msvc@4.28.1': + resolution: {integrity: sha512-ZvK2jBafvttJjoIdKm/Q/Bh7IJ1Ose9IBOwpOXcOvW3ikGTQGmKDgxTC6oCAzW6PynbkKP8+um1du81XJHZ0JA==} + cpu: [x64] + os: [win32] + + '@sinclair/typebox@0.27.8': + resolution: {integrity: sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==} + + '@sinonjs/commons@3.0.1': + resolution: {integrity: sha512-K3mCHKQ9sVh8o1C9cxkwxaOmXoAMlDxC1mYyHrjqOWEcBjYr76t96zL2zlj5dUGZ3HSw240X1qgH3Mjf1yJWpQ==} + + '@sinonjs/fake-timers@10.3.0': + resolution: {integrity: sha512-V4BG07kuYSUkTCSBHG8G8TNhM+F19jXFWnQtzj+we8DrkpSBCee9Z3Ms8yiGer/dlmhe35/Xdgyo3/0rQKg7YA==} + + '@types/axios@0.14.4': + resolution: {integrity: sha512-9JgOaunvQdsQ/qW2OPmE5+hCeUB52lQSolecrFrthct55QekhmXEwT203s20RL+UHtCQc15y3VXpby9E7Kkh/g==} + deprecated: This is a stub types definition. axios provides its own type definitions, so you do not need this installed. + + '@types/babel__core@7.20.5': + resolution: {integrity: sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==} + + '@types/babel__generator@7.6.8': + resolution: {integrity: sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw==} + + '@types/babel__template@7.4.4': + resolution: {integrity: sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==} + + '@types/babel__traverse@7.20.6': + resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==} + + '@types/dotenv@8.2.3': + resolution: {integrity: sha512-g2FXjlDX/cYuc5CiQvyU/6kkbP1JtmGzh0obW50zD7OKeILVL0NSpPWLXVfqoAGQjom2/SLLx9zHq0KXvD6mbw==} + deprecated: This is a stub types definition. dotenv provides its own type definitions, so you do not need this installed. + + '@types/estree@1.0.6': + resolution: {integrity: sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==} + + '@types/graceful-fs@4.1.9': + resolution: {integrity: sha512-olP3sd1qOEe5dXTSaFvQG+02VdRXcdytWLAZsAq1PecU8uqQAhkrnbli7DagjtXKW/Bl7YJbUsa8MPcuc8LHEQ==} + + '@types/istanbul-lib-coverage@2.0.6': + resolution: {integrity: sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w==} + + '@types/istanbul-lib-report@3.0.3': + resolution: {integrity: sha512-NQn7AHQnk/RSLOxrBbGyJM/aVQ+pjj5HCgasFxc0K/KhoATfQ/47AyUl15I2yBUpihjmas+a+VJBOqecrFH+uA==} + + '@types/istanbul-reports@3.0.4': + resolution: {integrity: sha512-pk2B1NWalF9toCRu6gjBzR69syFjP4Od8WRAX+0mmf9lAjCRicLOWc+ZrxZHx/0XRjotgkF9t6iaMJ+aXcOdZQ==} + + '@types/jest@29.5.14': + resolution: {integrity: sha512-ZN+4sdnLUbo8EVvVc2ao0GFW6oVrQRPn4K2lglySj7APvSrgzxHiNNK99us4WDMi57xxA2yggblIAMNhXOotLQ==} + + '@types/mocha@10.0.10': + resolution: {integrity: sha512-xPyYSz1cMPnJQhl0CLMH68j3gprKZaTjG3s5Vi+fDgx+uhG9NOXwbVt52eFS8ECyXhyKcjDLCBEqBExKuiZb7Q==} + + '@types/node@20.17.10': + resolution: {integrity: sha512-/jrvh5h6NXhEauFFexRin69nA0uHJ5gwk4iDivp/DeoEua3uwCUto6PC86IpRITBOs4+6i2I56K5x5b6WYGXHA==} + + '@types/stack-utils@2.0.3': + resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==} + + '@types/uuid@9.0.8': + resolution: {integrity: sha512-jg+97EGIcY9AGHJJRaaPVgetKDsrTgbRjQ5Msgjh/DQKEFl0DtyRr/VCOyD1T2R1MNeWPK/u7JoGhlDZnKBAfA==} + + '@types/yargs-parser@21.0.3': + resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} + + '@types/yargs@17.0.33': + resolution: {integrity: sha512-WpxBCKWPLr4xSsHgz511rFJAM+wS28w2zEO1QDNY5zM/S8ok70NNfztH0xwhqKyaK0OHCbN98LDAZuy1ctxDkA==} + + ansi-escapes@4.3.2: + resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} + engines: {node: '>=8'} + + ansi-regex@5.0.1: + resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} + engines: {node: '>=8'} + + ansi-regex@6.1.0: + resolution: {integrity: sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==} + engines: {node: '>=12'} + + ansi-styles@4.3.0: + resolution: {integrity: sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==} + engines: {node: '>=8'} + + ansi-styles@5.2.0: + resolution: {integrity: sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==} + engines: {node: '>=10'} + + ansi-styles@6.2.1: + resolution: {integrity: sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==} + engines: {node: '>=12'} + + any-promise@1.3.0: + resolution: {integrity: sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==} + + anymatch@3.1.3: + resolution: {integrity: sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==} + engines: {node: '>= 8'} + + argparse@1.0.10: + resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==} + + async@3.2.6: + resolution: {integrity: sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==} + + asynckit@0.4.0: + resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} + + axios@1.7.9: + resolution: {integrity: sha512-LhLcE7Hbiryz8oMDdDptSrWowmB4Bl6RCt6sIJKpRB4XtVf0iEgewX3au/pJqm+Py1kCASkb/FFKjxQaLtxJvw==} + + babel-jest@29.7.0: + resolution: {integrity: sha512-BrvGY3xZSwEcCzKvKsCi2GgHqDqsYkOP4/by5xCgIwGXQxIEh+8ew3gmrE1y7XRR6LHZIj6yLYnUi/mm2KXKBg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + '@babel/core': ^7.8.0 + + babel-plugin-istanbul@6.1.1: + resolution: {integrity: sha512-Y1IQok9821cC9onCx5otgFfRm7Lm+I+wwxOx738M/WLPZ9Q42m4IG5W0FNX8WLL2gYMZo3JkuXIH2DOpWM+qwA==} + engines: {node: '>=8'} + + babel-plugin-jest-hoist@29.6.3: + resolution: {integrity: sha512-ESAc/RJvGTFEzRwOTT4+lNDk/GNHMkKbNzsvT0qKRfDyyYTskxB5rnU2njIDYVxXCBHHEI1c0YwHob3WaYujOg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + babel-preset-current-node-syntax@1.1.0: + resolution: {integrity: sha512-ldYss8SbBlWva1bs28q78Ju5Zq1F+8BrqBZZ0VFhLBvhh6lCpC2o3gDJi/5DRLs9FgYZCnmPYIVFU4lRXCkyUw==} + peerDependencies: + '@babel/core': ^7.0.0 + + babel-preset-jest@29.6.3: + resolution: {integrity: sha512-0B3bhxR6snWXJZtR/RliHTDPRgn1sNHOR0yVtq/IiQFyuOVjFS+wuio/R4gSNkyYmKmJB4wGZv2NZanmKmTnNA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + '@babel/core': ^7.0.0 + + balanced-match@1.0.2: + resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} + + brace-expansion@1.1.11: + resolution: {integrity: sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==} + + brace-expansion@2.0.1: + resolution: {integrity: sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==} + + braces@3.0.3: + resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} + engines: {node: '>=8'} + + browserslist@4.24.3: + resolution: {integrity: sha512-1CPmv8iobE2fyRMV97dAcMVegvvWKxmq94hkLiAkUGwKVTyDLw33K+ZxiFrREKmmps4rIw6grcCFCnTMSZ/YiA==} + engines: {node: ^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7} + hasBin: true + + bs-logger@0.2.6: + resolution: {integrity: sha512-pd8DCoxmbgc7hyPKOvxtqNcjYoOsABPQdcCUjGp3d42VR2CX1ORhk2A87oqqu5R1kk+76nsxZupkmyd+MVtCog==} + engines: {node: '>= 6'} + + bser@2.1.1: + resolution: {integrity: sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==} + + buffer-from@1.1.2: + resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} + + bundle-require@5.0.0: + resolution: {integrity: sha512-GuziW3fSSmopcx4KRymQEJVbZUfqlCqcq7dvs6TYwKRZiegK/2buMxQTPs6MGlNv50wms1699qYO54R8XfRX4w==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + peerDependencies: + esbuild: '>=0.18' + + cac@6.7.14: + resolution: {integrity: sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==} + engines: {node: '>=8'} + + callsites@3.1.0: + resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==} + engines: {node: '>=6'} + + camelcase@5.3.1: + resolution: {integrity: sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==} + engines: {node: '>=6'} + + camelcase@6.3.0: + resolution: {integrity: sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==} + engines: {node: '>=10'} + + caniuse-lite@1.0.30001690: + resolution: {integrity: sha512-5ExiE3qQN6oF8Clf8ifIDcMRCRE/dMGcETG/XGMD8/XiXm6HXQgQTh1yZYLXXpSOsEUlJm1Xr7kGULZTuGtP/w==} + + chalk@4.1.2: + resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} + engines: {node: '>=10'} + + char-regex@1.0.2: + resolution: {integrity: sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==} + engines: {node: '>=10'} + + chokidar@4.0.3: + resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} + engines: {node: '>= 14.16.0'} + + ci-info@3.9.0: + resolution: {integrity: sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==} + engines: {node: '>=8'} + + cjs-module-lexer@1.4.1: + resolution: {integrity: sha512-cuSVIHi9/9E/+821Qjdvngor+xpnlwnuwIyZOaLmHBVdXL+gP+I6QQB9VkO7RI77YIcTV+S1W9AreJ5eN63JBA==} + + cliui@8.0.1: + resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==} + engines: {node: '>=12'} + + co@4.6.0: + resolution: {integrity: sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==} + engines: {iojs: '>= 1.0.0', node: '>= 0.12.0'} + + collect-v8-coverage@1.0.2: + resolution: {integrity: sha512-lHl4d5/ONEbLlJvaJNtsF/Lz+WvB07u2ycqTYbdrq7UypDXailES4valYb2eWiJFxZlVmpGekfqoxQhzyFdT4Q==} + + color-convert@2.0.1: + resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} + engines: {node: '>=7.0.0'} + + color-name@1.1.4: + resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + + combined-stream@1.0.8: + resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} + engines: {node: '>= 0.8'} + + commander@4.1.1: + resolution: {integrity: sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==} + engines: {node: '>= 6'} + + concat-map@0.0.1: + resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} + + consola@3.2.3: + resolution: {integrity: sha512-I5qxpzLv+sJhTVEoLYNcTW+bThDCPsit0vLNKShZx6rLtpilNpmmeTPaeqJb9ZE9dV3DGaeby6Vuhrw38WjeyQ==} + engines: {node: ^14.18.0 || >=16.10.0} + + convert-source-map@2.0.0: + resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==} + + create-jest@29.7.0: + resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true + + cross-spawn@7.0.6: + resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} + engines: {node: '>= 8'} + + debug@4.4.0: + resolution: {integrity: sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + + dedent@1.5.3: + resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} + peerDependencies: + babel-plugin-macros: ^3.1.0 + peerDependenciesMeta: + babel-plugin-macros: + optional: true + + deepmerge@4.3.1: + resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} + engines: {node: '>=0.10.0'} + + delayed-stream@1.0.0: + resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} + engines: {node: '>=0.4.0'} + + detect-newline@3.1.0: + resolution: {integrity: sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==} + engines: {node: '>=8'} + + diff-sequences@29.6.3: + resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + dotenv@16.4.7: + resolution: {integrity: sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==} + engines: {node: '>=12'} + + eastasianwidth@0.2.0: + resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} + + ejs@3.1.10: + resolution: {integrity: sha512-UeJmFfOrAQS8OJWPZ4qtgHyWExa088/MtK5UEyoJGFH67cDEXkZSviOiKRCZ4Xij0zxI3JECgYs3oKx+AizQBA==} + engines: {node: '>=0.10.0'} + hasBin: true + + electron-to-chromium@1.5.74: + resolution: {integrity: sha512-ck3//9RC+6oss/1Bh9tiAVFy5vfSKbRHAFh7Z3/eTRkEqJeWgymloShB17Vg3Z4nmDNp35vAd1BZ6CMW4Wt6Iw==} + + emittery@0.13.1: + resolution: {integrity: sha512-DeWwawk6r5yR9jFgnDKYt4sLS0LmHJJi3ZOnb5/JdbYwj3nW+FxQnHIjhBKz8YLC7oRNPVM9NQ47I3CVx34eqQ==} + engines: {node: '>=12'} + + emoji-regex@8.0.0: + resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==} + + emoji-regex@9.2.2: + resolution: {integrity: sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==} + + error-ex@1.3.2: + resolution: {integrity: sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==} + + esbuild@0.24.0: + resolution: {integrity: sha512-FuLPevChGDshgSicjisSooU0cemp/sGXR841D5LHMB7mTVOmsEHcAxaH3irL53+8YDIeVNQEySh4DaYU/iuPqQ==} + engines: {node: '>=18'} + hasBin: true + + escalade@3.2.0: + resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==} + engines: {node: '>=6'} + + escape-string-regexp@2.0.0: + resolution: {integrity: sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==} + engines: {node: '>=8'} + + esprima@4.0.1: + resolution: {integrity: sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==} + engines: {node: '>=4'} + hasBin: true + + execa@5.1.1: + resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} + engines: {node: '>=10'} + + exit@0.1.2: + resolution: {integrity: sha512-Zk/eNKV2zbjpKzrsQ+n1G6poVbErQxJ0LBOJXaKZ1EViLzH+hrLu9cdXI4zw9dBQJslwBEpbQ2P1oS7nDxs6jQ==} + engines: {node: '>= 0.8.0'} + + expect@29.7.0: + resolution: {integrity: sha512-2Zks0hf1VLFYI1kbh0I5jP3KHHyCHpkfyHBzsSXRFgl/Bg9mWYfMW8oD+PdMPlEwy5HNsR9JutYy6pMeOh61nw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + fast-json-stable-stringify@2.1.0: + resolution: {integrity: sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==} + + fb-watchman@2.0.2: + resolution: {integrity: sha512-p5161BqbuCaSnB8jIbzQHOlpgsPmK5rJVDfDKO91Axs5NC1uu3HRQm6wt9cd9/+GtQQIO53JdGXXoyDpTAsgYA==} + + fdir@6.4.2: + resolution: {integrity: sha512-KnhMXsKSPZlAhp7+IjUkRZKPb4fUyccpDrdFXbi4QL1qkmFh9kVY09Yox+n4MaOb3lHZ1Tv829C3oaaXoMYPDQ==} + peerDependencies: + picomatch: ^3 || ^4 + peerDependenciesMeta: + picomatch: + optional: true + + filelist@1.0.4: + resolution: {integrity: sha512-w1cEuf3S+DrLCQL7ET6kz+gmlJdbq9J7yXCSjK/OZCPA+qEN1WyF4ZAf0YYJa4/shHJra2t/d/r8SV4Ji+x+8Q==} + + fill-range@7.1.1: + resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} + engines: {node: '>=8'} + + find-up@4.1.0: + resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==} + engines: {node: '>=8'} + + follow-redirects@1.15.9: + resolution: {integrity: sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==} + engines: {node: '>=4.0'} + peerDependencies: + debug: '*' + peerDependenciesMeta: + debug: + optional: true + + foreground-child@3.3.0: + resolution: {integrity: sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==} + engines: {node: '>=14'} + + form-data@4.0.1: + resolution: {integrity: sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==} + engines: {node: '>= 6'} + + fs.realpath@1.0.0: + resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} + + fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + function-bind@1.1.2: + resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + + gensync@1.0.0-beta.2: + resolution: {integrity: sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==} + engines: {node: '>=6.9.0'} + + get-caller-file@2.0.5: + resolution: {integrity: sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==} + engines: {node: 6.* || 8.* || >= 10.*} + + get-package-type@0.1.0: + resolution: {integrity: sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==} + engines: {node: '>=8.0.0'} + + get-stream@6.0.1: + resolution: {integrity: sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==} + engines: {node: '>=10'} + + glob@10.4.5: + resolution: {integrity: sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==} + hasBin: true + + glob@7.2.3: + resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} + deprecated: Glob versions prior to v9 are no longer supported + + globals@11.12.0: + resolution: {integrity: sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==} + engines: {node: '>=4'} + + graceful-fs@4.2.11: + resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + + has-flag@4.0.0: + resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==} + engines: {node: '>=8'} + + hasown@2.0.2: + resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} + engines: {node: '>= 0.4'} + + html-escaper@2.0.2: + resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} + + human-signals@2.1.0: + resolution: {integrity: sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==} + engines: {node: '>=10.17.0'} + + import-local@3.2.0: + resolution: {integrity: sha512-2SPlun1JUPWoM6t3F0dw0FkCF/jWY8kttcY4f599GLTSjh2OCuuhdTkJQsEcZzBqbXZGKMK2OqW1oZsjtf/gQA==} + engines: {node: '>=8'} + hasBin: true + + imurmurhash@0.1.4: + resolution: {integrity: sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==} + engines: {node: '>=0.8.19'} + + inflight@1.0.6: + resolution: {integrity: sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==} + deprecated: This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful. + + inherits@2.0.4: + resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} + + is-arrayish@0.2.1: + resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} + + is-core-module@2.16.0: + resolution: {integrity: sha512-urTSINYfAYgcbLb0yDQ6egFm6h3Mo1DcF9EkyXSRjjzdHbsulg01qhwWuXdOoUBuTkbQ80KDboXa0vFJ+BDH+g==} + engines: {node: '>= 0.4'} + + is-fullwidth-code-point@3.0.0: + resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==} + engines: {node: '>=8'} + + is-generator-fn@2.1.0: + resolution: {integrity: sha512-cTIB4yPYL/Grw0EaSzASzg6bBy9gqCofvWN8okThAYIxKJZC+udlRAmGbM0XLeniEJSs8uEgHPGuHSe1XsOLSQ==} + engines: {node: '>=6'} + + is-number@7.0.0: + resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==} + engines: {node: '>=0.12.0'} + + is-stream@2.0.1: + resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==} + engines: {node: '>=8'} + + isexe@2.0.0: + resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} + + isows@1.0.6: + resolution: {integrity: sha512-lPHCayd40oW98/I0uvgaHKWCSvkzY27LjWLbtzOm64yQ+G3Q5npjjbdppU65iZXkK1Zt+kH9pfegli0AYfwYYw==} + peerDependencies: + ws: '*' + + istanbul-lib-coverage@3.2.2: + resolution: {integrity: sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg==} + engines: {node: '>=8'} + + istanbul-lib-instrument@5.2.1: + resolution: {integrity: sha512-pzqtp31nLv/XFOzXGuvhCb8qhjmTVo5vjVk19XE4CRlSWz0KoeJ3bw9XsA7nOp9YBf4qHjwBxkDzKcME/J29Yg==} + engines: {node: '>=8'} + + istanbul-lib-instrument@6.0.3: + resolution: {integrity: sha512-Vtgk7L/R2JHyyGW07spoFlB8/lpjiOLTjMdms6AFMraYt3BaJauod/NGrfnVG/y4Ix1JEuMRPDPEj2ua+zz1/Q==} + engines: {node: '>=10'} + + istanbul-lib-report@3.0.1: + resolution: {integrity: sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw==} + engines: {node: '>=10'} + + istanbul-lib-source-maps@4.0.1: + resolution: {integrity: sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw==} + engines: {node: '>=10'} + + istanbul-reports@3.1.7: + resolution: {integrity: sha512-BewmUXImeuRk2YY0PVbxgKAysvhRPUQE0h5QRM++nVWyubKGV0l8qQ5op8+B2DOmwSe63Jivj0BjkPQVf8fP5g==} + engines: {node: '>=8'} + + jackspeak@3.4.3: + resolution: {integrity: sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==} + + jake@10.9.2: + resolution: {integrity: sha512-2P4SQ0HrLQ+fw6llpLnOaGAvN2Zu6778SJMrCUwns4fOoG9ayrTiZk3VV8sCPkVZF8ab0zksVpS8FDY5pRCNBA==} + engines: {node: '>=10'} + hasBin: true + + jest-changed-files@29.7.0: + resolution: {integrity: sha512-fEArFiwf1BpQ+4bXSprcDc3/x4HSzL4al2tozwVpDFpsxALjLYdyiIK4e5Vz66GQJIbXJ82+35PtysofptNX2w==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-circus@29.7.0: + resolution: {integrity: sha512-3E1nCMgipcTkCocFwM90XXQab9bS+GMsjdpmPrlelaxwD93Ad8iVEjX/vvHPdLPnFf+L40u+5+iutRdA1N9myw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-cli@29.7.0: + resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true + peerDependencies: + node-notifier: ^8.0.1 || ^9.0.0 || ^10.0.0 + peerDependenciesMeta: + node-notifier: + optional: true + + jest-config@29.7.0: + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + '@types/node': '*' + ts-node: '>=9.0.0' + peerDependenciesMeta: + '@types/node': + optional: true + ts-node: + optional: true + + jest-diff@29.7.0: + resolution: {integrity: sha512-LMIgiIrhigmPrs03JHpxUh2yISK3vLFPkAodPeo0+BuF7wA2FoQbkEg1u8gBYBThncu7e1oEDUfIXVuTqLRUjw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-docblock@29.7.0: + resolution: {integrity: sha512-q617Auw3A612guyaFgsbFeYpNP5t2aoUNLwBUbc/0kD1R4t9ixDbyFTHd1nok4epoVFpr7PmeWHrhvuV3XaJ4g==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-each@29.7.0: + resolution: {integrity: sha512-gns+Er14+ZrEoC5fhOfYCY1LOHHr0TI+rQUHZS8Ttw2l7gl+80eHc/gFf2Ktkw0+SIACDTeWvpFcv3B04VembQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-environment-node@29.7.0: + resolution: {integrity: sha512-DOSwCRqXirTOyheM+4d5YZOrWcdu0LNZ87ewUoywbcb2XR4wKgqiG8vNeYwhjFMbEkfju7wx2GYH0P2gevGvFw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-get-type@29.6.3: + resolution: {integrity: sha512-zrteXnqYxfQh7l5FHyL38jL39di8H8rHoecLH3JNxH3BwOrBsNeabdap5e0I23lD4HHI8W5VFBZqG4Eaq5LNcw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-haste-map@29.7.0: + resolution: {integrity: sha512-fP8u2pyfqx0K1rGn1R9pyE0/KTn+G7PxktWidOBTqFPLYX0b9ksaMFkhK5vrS3DVun09pckLdlx90QthlW7AmA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-leak-detector@29.7.0: + resolution: {integrity: sha512-kYA8IJcSYtST2BY9I+SMC32nDpBT3J2NvWJx8+JCuCdl/CR1I4EKUJROiP8XtCcxqgTTBGJNdbB1A8XRKbTetw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-matcher-utils@29.7.0: + resolution: {integrity: sha512-sBkD+Xi9DtcChsI3L3u0+N0opgPYnCRPtGcQYrgXmR+hmt/fYfWAL0xRXYU8eWOdfuLgBe0YCW3AFtnRLagq/g==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-message-util@29.7.0: + resolution: {integrity: sha512-GBEV4GRADeP+qtB2+6u61stea8mGcOT4mCtrYISZwfu9/ISHFJ/5zOMXYbpBE9RsS5+Gb63DW4FgmnKJ79Kf6w==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-mock@29.7.0: + resolution: {integrity: sha512-ITOMZn+UkYS4ZFh83xYAOzWStloNzJFO2s8DWrE4lhtGD+AorgnbkiKERe4wQVBydIGPx059g6riW5Btp6Llnw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-pnp-resolver@1.2.3: + resolution: {integrity: sha512-+3NpwQEnRoIBtx4fyhblQDPgJI0H1IEIkX7ShLUjPGA7TtUTvI1oiKi3SR4oBR0hQhQR80l4WAe5RrXBwWMA8w==} + engines: {node: '>=6'} + peerDependencies: + jest-resolve: '*' + peerDependenciesMeta: + jest-resolve: + optional: true + + jest-regex-util@29.6.3: + resolution: {integrity: sha512-KJJBsRCyyLNWCNBOvZyRDnAIfUiRJ8v+hOBQYGn8gDyF3UegwiP4gwRR3/SDa42g1YbVycTidUF3rKjyLFDWbg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-resolve-dependencies@29.7.0: + resolution: {integrity: sha512-un0zD/6qxJ+S0et7WxeI3H5XSe9lTBBR7bOHCHXkKR6luG5mwDDlIzVQ0V5cZCuoTgEdcdwzTghYkTWfubi+nA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-resolve@29.7.0: + resolution: {integrity: sha512-IOVhZSrg+UvVAshDSDtHyFCCBUl/Q3AAJv8iZ6ZjnZ74xzvwuzLXid9IIIPgTnY62SJjfuupMKZsZQRsCvxEgA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-runner@29.7.0: + resolution: {integrity: sha512-fsc4N6cPCAahybGBfTRcq5wFR6fpLznMg47sY5aDpsoejOcVYFb07AHuSnR0liMcPTgBsA3ZJL6kFOjPdoNipQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-runtime@29.7.0: + resolution: {integrity: sha512-gUnLjgwdGqW7B4LvOIkbKs9WGbn+QLqRQQ9juC6HndeDiezIwhDP+mhMwHWCEcfQ5RUXa6OPnFF8BJh5xegwwQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-snapshot@29.7.0: + resolution: {integrity: sha512-Rm0BMWtxBcioHr1/OX5YCP8Uov4riHvKPknOGs804Zg9JGZgmIBkbtlxJC/7Z4msKYVbIJtfU+tKb8xlYNfdkw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-util@29.7.0: + resolution: {integrity: sha512-z6EbKajIpqGKU56y5KBUgy1dt1ihhQJgWzUlZHArA/+X2ad7Cb5iF+AK1EWVL/Bo7Rz9uurpqw6SiBCefUbCGA==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-validate@29.7.0: + resolution: {integrity: sha512-ZB7wHqaRGVw/9hST/OuFUReG7M8vKeq0/J2egIGLdvjHCmYqGARhzXmtgi+gVeZ5uXFF219aOc3Ls2yLg27tkw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-watcher@29.7.0: + resolution: {integrity: sha512-49Fg7WXkU3Vl2h6LbLtMQ/HyB6rXSIX7SqvBLQmssRBGN9I0PNvPmAmCWSOY6SOvrjhI/F7/bGAv9RtnsPA03g==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest-worker@29.7.0: + resolution: {integrity: sha512-eIz2msL/EzL9UFTFFx7jBTkeZfku0yUAyZZZmJ93H2TYEiroIx2PQjEXcwYtYl8zXCxb+PAmA2hLIt/6ZEkPHw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + jest@29.7.0: + resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true + peerDependencies: + node-notifier: ^8.0.1 || ^9.0.0 || ^10.0.0 + peerDependenciesMeta: + node-notifier: + optional: true + + joycon@3.1.1: + resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} + engines: {node: '>=10'} + + js-tokens@4.0.0: + resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} + + js-yaml@3.14.1: + resolution: {integrity: sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==} + hasBin: true + + jsesc@3.1.0: + resolution: {integrity: sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==} + engines: {node: '>=6'} + hasBin: true + + json-parse-even-better-errors@2.3.1: + resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} + + json5@2.2.3: + resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} + engines: {node: '>=6'} + hasBin: true + + kleur@3.0.3: + resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} + engines: {node: '>=6'} + + leven@3.1.0: + resolution: {integrity: sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==} + engines: {node: '>=6'} + + lilconfig@3.1.3: + resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==} + engines: {node: '>=14'} + + lines-and-columns@1.2.4: + resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} + + load-tsconfig@0.2.5: + resolution: {integrity: sha512-IXO6OCs9yg8tMKzfPZ1YmheJbZCiEsnBdcB03l0OcfK9prKnJb96siuHCr5Fl37/yo9DnKU+TLpxzTUspw9shg==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + + locate-path@5.0.0: + resolution: {integrity: sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==} + engines: {node: '>=8'} + + lodash.memoize@4.1.2: + resolution: {integrity: sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==} + + lodash.sortby@4.7.0: + resolution: {integrity: sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==} + + lru-cache@10.4.3: + resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} + + lru-cache@5.1.1: + resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} + + make-dir@4.0.0: + resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} + engines: {node: '>=10'} + + make-error@1.3.6: + resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} + + makeerror@1.0.12: + resolution: {integrity: sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==} + + merge-stream@2.0.0: + resolution: {integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==} + + micromatch@4.0.8: + resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} + engines: {node: '>=8.6'} + + mime-db@1.52.0: + resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==} + engines: {node: '>= 0.6'} + + mime-types@2.1.35: + resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==} + engines: {node: '>= 0.6'} + + mimic-fn@2.1.0: + resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} + engines: {node: '>=6'} + + minimatch@3.1.2: + resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} + + minimatch@5.1.6: + resolution: {integrity: sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==} + engines: {node: '>=10'} + + minimatch@9.0.5: + resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==} + engines: {node: '>=16 || 14 >=14.17'} + + minipass@7.1.2: + resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} + engines: {node: '>=16 || 14 >=14.17'} + + ms@2.1.3: + resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + + mz@2.7.0: + resolution: {integrity: sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==} + + natural-compare@1.4.0: + resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} + + node-int64@0.4.0: + resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==} + + node-releases@2.0.19: + resolution: {integrity: sha512-xxOWJsBKtzAq7DY0J+DTzuz58K8e7sJbdgwkbMWQe8UYB6ekmsQ45q0M/tJDsGaZmbC+l7n57UV8Hl5tHxO9uw==} + + normalize-path@3.0.0: + resolution: {integrity: sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==} + engines: {node: '>=0.10.0'} + + npm-run-path@4.0.1: + resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} + engines: {node: '>=8'} + + object-assign@4.1.1: + resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} + engines: {node: '>=0.10.0'} + + once@1.4.0: + resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + + onetime@5.1.2: + resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==} + engines: {node: '>=6'} + + p-limit@2.3.0: + resolution: {integrity: sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==} + engines: {node: '>=6'} + + p-limit@3.1.0: + resolution: {integrity: sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==} + engines: {node: '>=10'} + + p-locate@4.1.0: + resolution: {integrity: sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==} + engines: {node: '>=8'} + + p-try@2.2.0: + resolution: {integrity: sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==} + engines: {node: '>=6'} + + package-json-from-dist@1.0.1: + resolution: {integrity: sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==} + + parse-json@5.2.0: + resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==} + engines: {node: '>=8'} + + path-exists@4.0.0: + resolution: {integrity: sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==} + engines: {node: '>=8'} + + path-is-absolute@1.0.1: + resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==} + engines: {node: '>=0.10.0'} + + path-key@3.1.1: + resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} + engines: {node: '>=8'} + + path-parse@1.0.7: + resolution: {integrity: sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==} + + path-scurry@1.11.1: + resolution: {integrity: sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==} + engines: {node: '>=16 || 14 >=14.18'} + + picocolors@1.1.1: + resolution: {integrity: sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==} + + picomatch@2.3.1: + resolution: {integrity: sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==} + engines: {node: '>=8.6'} + + picomatch@4.0.2: + resolution: {integrity: sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==} + engines: {node: '>=12'} + + pirates@4.0.6: + resolution: {integrity: sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==} + engines: {node: '>= 6'} + + pkg-dir@4.2.0: + resolution: {integrity: sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==} + engines: {node: '>=8'} + + postcss-load-config@6.0.1: + resolution: {integrity: sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==} + engines: {node: '>= 18'} + peerDependencies: + jiti: '>=1.21.0' + postcss: '>=8.0.9' + tsx: ^4.8.1 + yaml: ^2.4.2 + peerDependenciesMeta: + jiti: + optional: true + postcss: + optional: true + tsx: + optional: true + yaml: + optional: true + + pretty-format@29.7.0: + resolution: {integrity: sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + + prompts@2.4.2: + resolution: {integrity: sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==} + engines: {node: '>= 6'} + + proxy-from-env@1.1.0: + resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} + + punycode@2.3.1: + resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} + engines: {node: '>=6'} + + pure-rand@6.1.0: + resolution: {integrity: sha512-bVWawvoZoBYpp6yIoQtQXHZjmz35RSVHnUOTefl8Vcjr8snTPY1wnpSPMWekcFwbxI6gtmT7rSYPFvz71ldiOA==} + + react-is@18.3.1: + resolution: {integrity: sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==} + + readdirp@4.0.2: + resolution: {integrity: sha512-yDMz9g+VaZkqBYS/ozoBJwaBhTbZo3UNYQHNRw1D3UFQB8oHB4uS/tAODO+ZLjGWmUbKnIlOWO+aaIiAxrUWHA==} + engines: {node: '>= 14.16.0'} + + require-directory@2.1.1: + resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} + engines: {node: '>=0.10.0'} + + resolve-cwd@3.0.0: + resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} + engines: {node: '>=8'} + + resolve-from@5.0.0: + resolution: {integrity: sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==} + engines: {node: '>=8'} + + resolve.exports@2.0.3: + resolution: {integrity: sha512-OcXjMsGdhL4XnbShKpAcSqPMzQoYkYyhbEaeSko47MjRP9NfEQMhZkXL1DoFlt9LWQn4YttrdnV6X2OiyzBi+A==} + engines: {node: '>=10'} + + resolve@1.22.9: + resolution: {integrity: sha512-QxrmX1DzraFIi9PxdG5VkRfRwIgjwyud+z/iBwfRRrVmHc+P9Q7u2lSSpQ6bjr2gy5lrqIiU9vb6iAeGf2400A==} + hasBin: true + + rollup@4.28.1: + resolution: {integrity: sha512-61fXYl/qNVinKmGSTHAZ6Yy8I3YIJC/r2m9feHo6SwVAVcLT5MPwOUFe7EuURA/4m0NR8lXG4BBXuo/IZEsjMg==} + engines: {node: '>=18.0.0', npm: '>=8.0.0'} + hasBin: true + + semver@6.3.1: + resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} + hasBin: true + + semver@7.6.3: + resolution: {integrity: sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==} + engines: {node: '>=10'} + hasBin: true + + shebang-command@2.0.0: + resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} + engines: {node: '>=8'} + + shebang-regex@3.0.0: + resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} + engines: {node: '>=8'} + + signal-exit@3.0.7: + resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + + signal-exit@4.1.0: + resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} + engines: {node: '>=14'} + + sisteransi@1.0.5: + resolution: {integrity: sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==} + + slash@3.0.0: + resolution: {integrity: sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==} + engines: {node: '>=8'} + + source-map-support@0.5.13: + resolution: {integrity: sha512-SHSKFHadjVA5oR4PPqhtAVdcBWwRYVd6g6cAXnIbRiIwc2EhPrTuKUBdSLvlEKyIP3GCf89fltvcZiP9MMFA1w==} + + source-map@0.6.1: + resolution: {integrity: sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==} + engines: {node: '>=0.10.0'} + + source-map@0.8.0-beta.0: + resolution: {integrity: sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA==} + engines: {node: '>= 8'} + + sprintf-js@1.0.3: + resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} + + stack-utils@2.0.6: + resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==} + engines: {node: '>=10'} + + string-length@4.0.2: + resolution: {integrity: sha512-+l6rNN5fYHNhZZy41RXsYptCjA2Igmq4EG7kZAYFQI1E1VTXarr6ZPXBg6eq7Y6eK4FEhY6AJlyuFIb/v/S0VQ==} + engines: {node: '>=10'} + + string-width@4.2.3: + resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} + engines: {node: '>=8'} + + string-width@5.1.2: + resolution: {integrity: sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==} + engines: {node: '>=12'} + + strip-ansi@6.0.1: + resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} + engines: {node: '>=8'} + + strip-ansi@7.1.0: + resolution: {integrity: sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==} + engines: {node: '>=12'} + + strip-bom@4.0.0: + resolution: {integrity: sha512-3xurFv5tEgii33Zi8Jtp55wEIILR9eh34FAW00PZf+JnSsTmV/ioewSgQl97JHvgjoRGwPShsWm+IdrxB35d0w==} + engines: {node: '>=8'} + + strip-final-newline@2.0.0: + resolution: {integrity: sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==} + engines: {node: '>=6'} + + strip-json-comments@3.1.1: + resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} + engines: {node: '>=8'} + + sucrase@3.35.0: + resolution: {integrity: sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==} + engines: {node: '>=16 || 14 >=14.17'} + hasBin: true + + supports-color@7.2.0: + resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==} + engines: {node: '>=8'} + + supports-color@8.1.1: + resolution: {integrity: sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==} + engines: {node: '>=10'} + + supports-preserve-symlinks-flag@1.0.0: + resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==} + engines: {node: '>= 0.4'} + + test-exclude@6.0.0: + resolution: {integrity: sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==} + engines: {node: '>=8'} + + thenify-all@1.6.0: + resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==} + engines: {node: '>=0.8'} + + thenify@3.3.1: + resolution: {integrity: sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==} + + tinyexec@0.3.1: + resolution: {integrity: sha512-WiCJLEECkO18gwqIp6+hJg0//p23HXp4S+gGtAKu3mI2F2/sXC4FvHvXvB0zJVVaTPhx1/tOwdbRsa1sOBIKqQ==} + + tinyglobby@0.2.10: + resolution: {integrity: sha512-Zc+8eJlFMvgatPZTl6A9L/yht8QqdmUNtURHaKZLmKBE12hNPSrqNkUp2cs3M/UKmNVVAMFQYSjYIVHDjW5zew==} + engines: {node: '>=12.0.0'} + + tmpl@1.0.5: + resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==} + + to-regex-range@5.0.1: + resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} + engines: {node: '>=8.0'} + + tr46@1.0.1: + resolution: {integrity: sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} + + tree-kill@1.2.2: + resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} + hasBin: true + + ts-interface-checker@0.1.13: + resolution: {integrity: sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==} + + ts-jest@29.2.5: + resolution: {integrity: sha512-KD8zB2aAZrcKIdGk4OwpJggeLcH1FgrICqDSROWqlnJXGCXK4Mn6FcdK2B6670Xr73lHMG1kHw8R87A0ecZ+vA==} + engines: {node: ^14.15.0 || ^16.10.0 || ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@babel/core': '>=7.0.0-beta.0 <8' + '@jest/transform': ^29.0.0 + '@jest/types': ^29.0.0 + babel-jest: ^29.0.0 + esbuild: '*' + jest: ^29.0.0 + typescript: '>=4.3 <6' + peerDependenciesMeta: + '@babel/core': + optional: true + '@jest/transform': + optional: true + '@jest/types': + optional: true + babel-jest: + optional: true + esbuild: + optional: true + + tsup@8.3.5: + resolution: {integrity: sha512-Tunf6r6m6tnZsG9GYWndg0z8dEV7fD733VBFzFJ5Vcm1FtlXB8xBD/rtrBi2a3YKEV7hHtxiZtW5EAVADoe1pA==} + engines: {node: '>=18'} + hasBin: true + peerDependencies: + '@microsoft/api-extractor': ^7.36.0 + '@swc/core': ^1 + postcss: ^8.4.12 + typescript: '>=4.5.0' + peerDependenciesMeta: + '@microsoft/api-extractor': + optional: true + '@swc/core': + optional: true + postcss: + optional: true + typescript: + optional: true + + type-detect@4.0.8: + resolution: {integrity: sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==} + engines: {node: '>=4'} + + type-fest@0.21.3: + resolution: {integrity: sha512-t0rzBq87m3fVcduHDUFhKmyyX+9eo6WQjZvf51Ea/M0Q7+T374Jp1aUiyUl0GKxp8M/OETVHSDvmkyPgvX+X2w==} + engines: {node: '>=10'} + + typescript-event-target@1.1.1: + resolution: {integrity: sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==} + + typescript@5.7.2: + resolution: {integrity: sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@6.19.8: + resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==} + + update-browserslist-db@1.1.1: + resolution: {integrity: sha512-R8UzCaa9Az+38REPiJ1tXlImTJXlVfgHZsglwBD/k6nj76ctsH1E3q4doGrukiLQd3sGQYu56r5+lo5r94l29A==} + hasBin: true + peerDependencies: + browserslist: '>= 4.21.0' + + uuid@9.0.1: + resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==} + hasBin: true + + v8-to-istanbul@9.3.0: + resolution: {integrity: sha512-kiGUalWN+rgBJ/1OHZsBtU4rXZOfj/7rKQxULKlIzwzQSvMJUUNgPwJEEh7gU6xEVxC0ahoOBvN2YI8GH6FNgA==} + engines: {node: '>=10.12.0'} + + walker@1.0.8: + resolution: {integrity: sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==} + + webidl-conversions@4.0.2: + resolution: {integrity: sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==} + + whatwg-url@7.1.0: + resolution: {integrity: sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==} + + which@2.0.2: + resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} + engines: {node: '>= 8'} + hasBin: true + + wrap-ansi@7.0.0: + resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} + engines: {node: '>=10'} + + wrap-ansi@8.1.0: + resolution: {integrity: sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==} + engines: {node: '>=12'} + + wrappy@1.0.2: + resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} + + write-file-atomic@4.0.2: + resolution: {integrity: sha512-7KxauUdBmSdWnmpaGFg+ppNjKF8uNLry8LyzjauQDOVONfFLNKrKvQOxZ/VuTIcS/gge/YNahf5RIIQWTSarlg==} + engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0} + + ws@8.18.0: + resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==} + engines: {node: '>=10.0.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: '>=5.0.2' + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + y18n@5.0.8: + resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} + engines: {node: '>=10'} + + yallist@3.1.1: + resolution: {integrity: sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==} + + yargs-parser@21.1.1: + resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==} + engines: {node: '>=12'} + + yargs@17.7.2: + resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==} + engines: {node: '>=12'} + + yocto-queue@0.1.0: + resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} + engines: {node: '>=10'} + + zod-to-json-schema@3.24.1: + resolution: {integrity: sha512-3h08nf3Vw3Wl3PK+q3ow/lIil81IT2Oa7YpQyUUDsEWbXveMesdfK1xBd2RhCkynwZndAxixji/7SYJJowr62w==} + peerDependencies: + zod: ^3.24.1 + + zod@3.24.1: + resolution: {integrity: sha512-muH7gBL9sI1nciMZV67X5fTKKBLtwpZ5VBp1vsOQzj1MhrBZ4wlVCm3gedKZWLp0Oyel8sIGfeiz54Su+OVT+A==} + +snapshots: + + '@ampproject/remapping@2.3.0': + dependencies: + '@jridgewell/gen-mapping': 0.3.8 + '@jridgewell/trace-mapping': 0.3.25 + + '@babel/code-frame@7.26.2': + dependencies: + '@babel/helper-validator-identifier': 7.25.9 + js-tokens: 4.0.0 + picocolors: 1.1.1 + + '@babel/compat-data@7.26.3': {} + + '@babel/core@7.26.0': + dependencies: + '@ampproject/remapping': 2.3.0 + '@babel/code-frame': 7.26.2 + '@babel/generator': 7.26.3 + '@babel/helper-compilation-targets': 7.25.9 + '@babel/helper-module-transforms': 7.26.0(@babel/core@7.26.0) + '@babel/helpers': 7.26.0 + '@babel/parser': 7.26.3 + '@babel/template': 7.25.9 + '@babel/traverse': 7.26.4 + '@babel/types': 7.26.3 + convert-source-map: 2.0.0 + debug: 4.4.0 + gensync: 1.0.0-beta.2 + json5: 2.2.3 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + '@babel/generator@7.26.3': + dependencies: + '@babel/parser': 7.26.3 + '@babel/types': 7.26.3 + '@jridgewell/gen-mapping': 0.3.8 + '@jridgewell/trace-mapping': 0.3.25 + jsesc: 3.1.0 + + '@babel/helper-compilation-targets@7.25.9': + dependencies: + '@babel/compat-data': 7.26.3 + '@babel/helper-validator-option': 7.25.9 + browserslist: 4.24.3 + lru-cache: 5.1.1 + semver: 6.3.1 + + '@babel/helper-module-imports@7.25.9': + dependencies: + '@babel/traverse': 7.26.4 + '@babel/types': 7.26.3 + transitivePeerDependencies: + - supports-color + + '@babel/helper-module-transforms@7.26.0(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-module-imports': 7.25.9 + '@babel/helper-validator-identifier': 7.25.9 + '@babel/traverse': 7.26.4 + transitivePeerDependencies: + - supports-color + + '@babel/helper-plugin-utils@7.25.9': {} + + '@babel/helper-string-parser@7.25.9': {} + + '@babel/helper-validator-identifier@7.25.9': {} + + '@babel/helper-validator-option@7.25.9': {} + + '@babel/helpers@7.26.0': + dependencies: + '@babel/template': 7.25.9 + '@babel/types': 7.26.3 + + '@babel/parser@7.26.3': + dependencies: + '@babel/types': 7.26.3 + + '@babel/plugin-syntax-async-generators@7.8.4(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-bigint@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-class-properties@7.12.13(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-class-static-block@7.14.5(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-import-attributes@7.26.0(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-import-meta@7.10.4(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-json-strings@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-jsx@7.25.9(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-logical-assignment-operators@7.10.4(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-nullish-coalescing-operator@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-numeric-separator@7.10.4(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-object-rest-spread@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-optional-catch-binding@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-optional-chaining@7.8.3(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-private-property-in-object@7.14.5(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-top-level-await@7.14.5(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/plugin-syntax-typescript@7.25.9(@babel/core@7.26.0)': + dependencies: + '@babel/core': 7.26.0 + '@babel/helper-plugin-utils': 7.25.9 + + '@babel/template@7.25.9': + dependencies: + '@babel/code-frame': 7.26.2 + '@babel/parser': 7.26.3 + '@babel/types': 7.26.3 + + '@babel/traverse@7.26.4': + dependencies: + '@babel/code-frame': 7.26.2 + '@babel/generator': 7.26.3 + '@babel/parser': 7.26.3 + '@babel/template': 7.25.9 + '@babel/types': 7.26.3 + debug: 4.4.0 + globals: 11.12.0 + transitivePeerDependencies: + - supports-color + + '@babel/types@7.26.3': + dependencies: + '@babel/helper-string-parser': 7.25.9 + '@babel/helper-validator-identifier': 7.25.9 + + '@bcoe/v8-coverage@0.2.3': {} + + '@esbuild/aix-ppc64@0.24.0': + optional: true + + '@esbuild/android-arm64@0.24.0': + optional: true + + '@esbuild/android-arm@0.24.0': + optional: true + + '@esbuild/android-x64@0.24.0': + optional: true + + '@esbuild/darwin-arm64@0.24.0': + optional: true + + '@esbuild/darwin-x64@0.24.0': + optional: true + + '@esbuild/freebsd-arm64@0.24.0': + optional: true + + '@esbuild/freebsd-x64@0.24.0': + optional: true + + '@esbuild/linux-arm64@0.24.0': + optional: true + + '@esbuild/linux-arm@0.24.0': + optional: true + + '@esbuild/linux-ia32@0.24.0': + optional: true + + '@esbuild/linux-loong64@0.24.0': + optional: true + + '@esbuild/linux-mips64el@0.24.0': + optional: true + + '@esbuild/linux-ppc64@0.24.0': + optional: true + + '@esbuild/linux-riscv64@0.24.0': + optional: true + + '@esbuild/linux-s390x@0.24.0': + optional: true + + '@esbuild/linux-x64@0.24.0': + optional: true + + '@esbuild/netbsd-x64@0.24.0': + optional: true + + '@esbuild/openbsd-arm64@0.24.0': + optional: true + + '@esbuild/openbsd-x64@0.24.0': + optional: true + + '@esbuild/sunos-x64@0.24.0': + optional: true + + '@esbuild/win32-arm64@0.24.0': + optional: true + + '@esbuild/win32-ia32@0.24.0': + optional: true + + '@esbuild/win32-x64@0.24.0': + optional: true + + '@isaacs/cliui@8.0.2': + dependencies: + string-width: 5.1.2 + string-width-cjs: string-width@4.2.3 + strip-ansi: 7.1.0 + strip-ansi-cjs: strip-ansi@6.0.1 + wrap-ansi: 8.1.0 + wrap-ansi-cjs: wrap-ansi@7.0.0 + + '@istanbuljs/load-nyc-config@1.1.0': + dependencies: + camelcase: 5.3.1 + find-up: 4.1.0 + get-package-type: 0.1.0 + js-yaml: 3.14.1 + resolve-from: 5.0.0 + + '@istanbuljs/schema@0.1.3': {} + + '@jest/console@29.7.0': + dependencies: + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + chalk: 4.1.2 + jest-message-util: 29.7.0 + jest-util: 29.7.0 + slash: 3.0.0 + + '@jest/core@29.7.0': + dependencies: + '@jest/console': 29.7.0 + '@jest/reporters': 29.7.0 + '@jest/test-result': 29.7.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + ansi-escapes: 4.3.2 + chalk: 4.1.2 + ci-info: 3.9.0 + exit: 0.1.2 + graceful-fs: 4.2.11 + jest-changed-files: 29.7.0 + jest-config: 29.7.0(@types/node@20.17.10) + jest-haste-map: 29.7.0 + jest-message-util: 29.7.0 + jest-regex-util: 29.6.3 + jest-resolve: 29.7.0 + jest-resolve-dependencies: 29.7.0 + jest-runner: 29.7.0 + jest-runtime: 29.7.0 + jest-snapshot: 29.7.0 + jest-util: 29.7.0 + jest-validate: 29.7.0 + jest-watcher: 29.7.0 + micromatch: 4.0.8 + pretty-format: 29.7.0 + slash: 3.0.0 + strip-ansi: 6.0.1 + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + - ts-node + + '@jest/environment@29.7.0': + dependencies: + '@jest/fake-timers': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + jest-mock: 29.7.0 + + '@jest/expect-utils@29.7.0': + dependencies: + jest-get-type: 29.6.3 + + '@jest/expect@29.7.0': + dependencies: + expect: 29.7.0 + jest-snapshot: 29.7.0 + transitivePeerDependencies: + - supports-color + + '@jest/fake-timers@29.7.0': + dependencies: + '@jest/types': 29.6.3 + '@sinonjs/fake-timers': 10.3.0 + '@types/node': 20.17.10 + jest-message-util: 29.7.0 + jest-mock: 29.7.0 + jest-util: 29.7.0 + + '@jest/globals@29.7.0': + dependencies: + '@jest/environment': 29.7.0 + '@jest/expect': 29.7.0 + '@jest/types': 29.6.3 + jest-mock: 29.7.0 + transitivePeerDependencies: + - supports-color + + '@jest/reporters@29.7.0': + dependencies: + '@bcoe/v8-coverage': 0.2.3 + '@jest/console': 29.7.0 + '@jest/test-result': 29.7.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + '@jridgewell/trace-mapping': 0.3.25 + '@types/node': 20.17.10 + chalk: 4.1.2 + collect-v8-coverage: 1.0.2 + exit: 0.1.2 + glob: 7.2.3 + graceful-fs: 4.2.11 + istanbul-lib-coverage: 3.2.2 + istanbul-lib-instrument: 6.0.3 + istanbul-lib-report: 3.0.1 + istanbul-lib-source-maps: 4.0.1 + istanbul-reports: 3.1.7 + jest-message-util: 29.7.0 + jest-util: 29.7.0 + jest-worker: 29.7.0 + slash: 3.0.0 + string-length: 4.0.2 + strip-ansi: 6.0.1 + v8-to-istanbul: 9.3.0 + transitivePeerDependencies: + - supports-color + + '@jest/schemas@29.6.3': + dependencies: + '@sinclair/typebox': 0.27.8 + + '@jest/source-map@29.6.3': + dependencies: + '@jridgewell/trace-mapping': 0.3.25 + callsites: 3.1.0 + graceful-fs: 4.2.11 + + '@jest/test-result@29.7.0': + dependencies: + '@jest/console': 29.7.0 + '@jest/types': 29.6.3 + '@types/istanbul-lib-coverage': 2.0.6 + collect-v8-coverage: 1.0.2 + + '@jest/test-sequencer@29.7.0': + dependencies: + '@jest/test-result': 29.7.0 + graceful-fs: 4.2.11 + jest-haste-map: 29.7.0 + slash: 3.0.0 + + '@jest/transform@29.7.0': + dependencies: + '@babel/core': 7.26.0 + '@jest/types': 29.6.3 + '@jridgewell/trace-mapping': 0.3.25 + babel-plugin-istanbul: 6.1.1 + chalk: 4.1.2 + convert-source-map: 2.0.0 + fast-json-stable-stringify: 2.1.0 + graceful-fs: 4.2.11 + jest-haste-map: 29.7.0 + jest-regex-util: 29.6.3 + jest-util: 29.7.0 + micromatch: 4.0.8 + pirates: 4.0.6 + slash: 3.0.0 + write-file-atomic: 4.0.2 + transitivePeerDependencies: + - supports-color + + '@jest/types@29.6.3': + dependencies: + '@jest/schemas': 29.6.3 + '@types/istanbul-lib-coverage': 2.0.6 + '@types/istanbul-reports': 3.0.4 + '@types/node': 20.17.10 + '@types/yargs': 17.0.33 + chalk: 4.1.2 + + '@jridgewell/gen-mapping@0.3.8': + dependencies: + '@jridgewell/set-array': 1.2.1 + '@jridgewell/sourcemap-codec': 1.5.0 + '@jridgewell/trace-mapping': 0.3.25 + + '@jridgewell/resolve-uri@3.1.2': {} + + '@jridgewell/set-array@1.2.1': {} + + '@jridgewell/sourcemap-codec@1.5.0': {} + + '@jridgewell/trace-mapping@0.3.25': + dependencies: + '@jridgewell/resolve-uri': 3.1.2 + '@jridgewell/sourcemap-codec': 1.5.0 + + '@pkgjs/parseargs@0.11.0': + optional: true + + '@rollup/rollup-android-arm-eabi@4.28.1': + optional: true + + '@rollup/rollup-android-arm64@4.28.1': + optional: true + + '@rollup/rollup-darwin-arm64@4.28.1': + optional: true + + '@rollup/rollup-darwin-x64@4.28.1': + optional: true + + '@rollup/rollup-freebsd-arm64@4.28.1': + optional: true + + '@rollup/rollup-freebsd-x64@4.28.1': + optional: true + + '@rollup/rollup-linux-arm-gnueabihf@4.28.1': + optional: true + + '@rollup/rollup-linux-arm-musleabihf@4.28.1': + optional: true + + '@rollup/rollup-linux-arm64-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-arm64-musl@4.28.1': + optional: true + + '@rollup/rollup-linux-loongarch64-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-powerpc64le-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-riscv64-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-s390x-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-x64-gnu@4.28.1': + optional: true + + '@rollup/rollup-linux-x64-musl@4.28.1': + optional: true + + '@rollup/rollup-win32-arm64-msvc@4.28.1': + optional: true + + '@rollup/rollup-win32-ia32-msvc@4.28.1': + optional: true + + '@rollup/rollup-win32-x64-msvc@4.28.1': + optional: true + + '@sinclair/typebox@0.27.8': {} + + '@sinonjs/commons@3.0.1': + dependencies: + type-detect: 4.0.8 + + '@sinonjs/fake-timers@10.3.0': + dependencies: + '@sinonjs/commons': 3.0.1 + + '@types/axios@0.14.4': + dependencies: + axios: 1.7.9 + transitivePeerDependencies: + - debug + + '@types/babel__core@7.20.5': + dependencies: + '@babel/parser': 7.26.3 + '@babel/types': 7.26.3 + '@types/babel__generator': 7.6.8 + '@types/babel__template': 7.4.4 + '@types/babel__traverse': 7.20.6 + + '@types/babel__generator@7.6.8': + dependencies: + '@babel/types': 7.26.3 + + '@types/babel__template@7.4.4': + dependencies: + '@babel/parser': 7.26.3 + '@babel/types': 7.26.3 + + '@types/babel__traverse@7.20.6': + dependencies: + '@babel/types': 7.26.3 + + '@types/dotenv@8.2.3': + dependencies: + dotenv: 16.4.7 + + '@types/estree@1.0.6': {} + + '@types/graceful-fs@4.1.9': + dependencies: + '@types/node': 20.17.10 + + '@types/istanbul-lib-coverage@2.0.6': {} + + '@types/istanbul-lib-report@3.0.3': + dependencies: + '@types/istanbul-lib-coverage': 2.0.6 + + '@types/istanbul-reports@3.0.4': + dependencies: + '@types/istanbul-lib-report': 3.0.3 + + '@types/jest@29.5.14': + dependencies: + expect: 29.7.0 + pretty-format: 29.7.0 + + '@types/mocha@10.0.10': {} + + '@types/node@20.17.10': + dependencies: + undici-types: 6.19.8 + + '@types/stack-utils@2.0.3': {} + + '@types/uuid@9.0.8': {} + + '@types/yargs-parser@21.0.3': {} + + '@types/yargs@17.0.33': + dependencies: + '@types/yargs-parser': 21.0.3 + + ansi-escapes@4.3.2: + dependencies: + type-fest: 0.21.3 + + ansi-regex@5.0.1: {} + + ansi-regex@6.1.0: {} + + ansi-styles@4.3.0: + dependencies: + color-convert: 2.0.1 + + ansi-styles@5.2.0: {} + + ansi-styles@6.2.1: {} + + any-promise@1.3.0: {} + + anymatch@3.1.3: + dependencies: + normalize-path: 3.0.0 + picomatch: 2.3.1 + + argparse@1.0.10: + dependencies: + sprintf-js: 1.0.3 + + async@3.2.6: {} + + asynckit@0.4.0: {} + + axios@1.7.9: + dependencies: + follow-redirects: 1.15.9 + form-data: 4.0.1 + proxy-from-env: 1.1.0 + transitivePeerDependencies: + - debug + + babel-jest@29.7.0(@babel/core@7.26.0): + dependencies: + '@babel/core': 7.26.0 + '@jest/transform': 29.7.0 + '@types/babel__core': 7.20.5 + babel-plugin-istanbul: 6.1.1 + babel-preset-jest: 29.6.3(@babel/core@7.26.0) + chalk: 4.1.2 + graceful-fs: 4.2.11 + slash: 3.0.0 + transitivePeerDependencies: + - supports-color + + babel-plugin-istanbul@6.1.1: + dependencies: + '@babel/helper-plugin-utils': 7.25.9 + '@istanbuljs/load-nyc-config': 1.1.0 + '@istanbuljs/schema': 0.1.3 + istanbul-lib-instrument: 5.2.1 + test-exclude: 6.0.0 + transitivePeerDependencies: + - supports-color + + babel-plugin-jest-hoist@29.6.3: + dependencies: + '@babel/template': 7.25.9 + '@babel/types': 7.26.3 + '@types/babel__core': 7.20.5 + '@types/babel__traverse': 7.20.6 + + babel-preset-current-node-syntax@1.1.0(@babel/core@7.26.0): + dependencies: + '@babel/core': 7.26.0 + '@babel/plugin-syntax-async-generators': 7.8.4(@babel/core@7.26.0) + '@babel/plugin-syntax-bigint': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-class-properties': 7.12.13(@babel/core@7.26.0) + '@babel/plugin-syntax-class-static-block': 7.14.5(@babel/core@7.26.0) + '@babel/plugin-syntax-import-attributes': 7.26.0(@babel/core@7.26.0) + '@babel/plugin-syntax-import-meta': 7.10.4(@babel/core@7.26.0) + '@babel/plugin-syntax-json-strings': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-logical-assignment-operators': 7.10.4(@babel/core@7.26.0) + '@babel/plugin-syntax-nullish-coalescing-operator': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-numeric-separator': 7.10.4(@babel/core@7.26.0) + '@babel/plugin-syntax-object-rest-spread': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-optional-catch-binding': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-optional-chaining': 7.8.3(@babel/core@7.26.0) + '@babel/plugin-syntax-private-property-in-object': 7.14.5(@babel/core@7.26.0) + '@babel/plugin-syntax-top-level-await': 7.14.5(@babel/core@7.26.0) + + babel-preset-jest@29.6.3(@babel/core@7.26.0): + dependencies: + '@babel/core': 7.26.0 + babel-plugin-jest-hoist: 29.6.3 + babel-preset-current-node-syntax: 1.1.0(@babel/core@7.26.0) + + balanced-match@1.0.2: {} + + brace-expansion@1.1.11: + dependencies: + balanced-match: 1.0.2 + concat-map: 0.0.1 + + brace-expansion@2.0.1: + dependencies: + balanced-match: 1.0.2 + + braces@3.0.3: + dependencies: + fill-range: 7.1.1 + + browserslist@4.24.3: + dependencies: + caniuse-lite: 1.0.30001690 + electron-to-chromium: 1.5.74 + node-releases: 2.0.19 + update-browserslist-db: 1.1.1(browserslist@4.24.3) + + bs-logger@0.2.6: + dependencies: + fast-json-stable-stringify: 2.1.0 + + bser@2.1.1: + dependencies: + node-int64: 0.4.0 + + buffer-from@1.1.2: {} + + bundle-require@5.0.0(esbuild@0.24.0): + dependencies: + esbuild: 0.24.0 + load-tsconfig: 0.2.5 + + cac@6.7.14: {} + + callsites@3.1.0: {} + + camelcase@5.3.1: {} + + camelcase@6.3.0: {} + + caniuse-lite@1.0.30001690: {} + + chalk@4.1.2: + dependencies: + ansi-styles: 4.3.0 + supports-color: 7.2.0 + + char-regex@1.0.2: {} + + chokidar@4.0.3: + dependencies: + readdirp: 4.0.2 + + ci-info@3.9.0: {} + + cjs-module-lexer@1.4.1: {} + + cliui@8.0.1: + dependencies: + string-width: 4.2.3 + strip-ansi: 6.0.1 + wrap-ansi: 7.0.0 + + co@4.6.0: {} + + collect-v8-coverage@1.0.2: {} + + color-convert@2.0.1: + dependencies: + color-name: 1.1.4 + + color-name@1.1.4: {} + + combined-stream@1.0.8: + dependencies: + delayed-stream: 1.0.0 + + commander@4.1.1: {} + + concat-map@0.0.1: {} + + consola@3.2.3: {} + + convert-source-map@2.0.0: {} + + create-jest@29.7.0(@types/node@20.17.10): + dependencies: + '@jest/types': 29.6.3 + chalk: 4.1.2 + exit: 0.1.2 + graceful-fs: 4.2.11 + jest-config: 29.7.0(@types/node@20.17.10) + jest-util: 29.7.0 + prompts: 2.4.2 + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros + - supports-color + - ts-node + + cross-spawn@7.0.6: + dependencies: + path-key: 3.1.1 + shebang-command: 2.0.0 + which: 2.0.2 + + debug@4.4.0: + dependencies: + ms: 2.1.3 + + dedent@1.5.3: {} + + deepmerge@4.3.1: {} + + delayed-stream@1.0.0: {} + + detect-newline@3.1.0: {} + + diff-sequences@29.6.3: {} + + dotenv@16.4.7: {} + + eastasianwidth@0.2.0: {} + + ejs@3.1.10: + dependencies: + jake: 10.9.2 + + electron-to-chromium@1.5.74: {} + + emittery@0.13.1: {} + + emoji-regex@8.0.0: {} + + emoji-regex@9.2.2: {} + + error-ex@1.3.2: + dependencies: + is-arrayish: 0.2.1 + + esbuild@0.24.0: + optionalDependencies: + '@esbuild/aix-ppc64': 0.24.0 + '@esbuild/android-arm': 0.24.0 + '@esbuild/android-arm64': 0.24.0 + '@esbuild/android-x64': 0.24.0 + '@esbuild/darwin-arm64': 0.24.0 + '@esbuild/darwin-x64': 0.24.0 + '@esbuild/freebsd-arm64': 0.24.0 + '@esbuild/freebsd-x64': 0.24.0 + '@esbuild/linux-arm': 0.24.0 + '@esbuild/linux-arm64': 0.24.0 + '@esbuild/linux-ia32': 0.24.0 + '@esbuild/linux-loong64': 0.24.0 + '@esbuild/linux-mips64el': 0.24.0 + '@esbuild/linux-ppc64': 0.24.0 + '@esbuild/linux-riscv64': 0.24.0 + '@esbuild/linux-s390x': 0.24.0 + '@esbuild/linux-x64': 0.24.0 + '@esbuild/netbsd-x64': 0.24.0 + '@esbuild/openbsd-arm64': 0.24.0 + '@esbuild/openbsd-x64': 0.24.0 + '@esbuild/sunos-x64': 0.24.0 + '@esbuild/win32-arm64': 0.24.0 + '@esbuild/win32-ia32': 0.24.0 + '@esbuild/win32-x64': 0.24.0 + + escalade@3.2.0: {} + + escape-string-regexp@2.0.0: {} + + esprima@4.0.1: {} + + execa@5.1.1: + dependencies: + cross-spawn: 7.0.6 + get-stream: 6.0.1 + human-signals: 2.1.0 + is-stream: 2.0.1 + merge-stream: 2.0.0 + npm-run-path: 4.0.1 + onetime: 5.1.2 + signal-exit: 3.0.7 + strip-final-newline: 2.0.0 + + exit@0.1.2: {} + + expect@29.7.0: + dependencies: + '@jest/expect-utils': 29.7.0 + jest-get-type: 29.6.3 + jest-matcher-utils: 29.7.0 + jest-message-util: 29.7.0 + jest-util: 29.7.0 + + fast-json-stable-stringify@2.1.0: {} + + fb-watchman@2.0.2: + dependencies: + bser: 2.1.1 + + fdir@6.4.2(picomatch@4.0.2): + optionalDependencies: + picomatch: 4.0.2 + + filelist@1.0.4: + dependencies: + minimatch: 5.1.6 + + fill-range@7.1.1: + dependencies: + to-regex-range: 5.0.1 + + find-up@4.1.0: + dependencies: + locate-path: 5.0.0 + path-exists: 4.0.0 + + follow-redirects@1.15.9: {} + + foreground-child@3.3.0: + dependencies: + cross-spawn: 7.0.6 + signal-exit: 4.1.0 + + form-data@4.0.1: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + mime-types: 2.1.35 + + fs.realpath@1.0.0: {} + + fsevents@2.3.3: + optional: true + + function-bind@1.1.2: {} + + gensync@1.0.0-beta.2: {} + + get-caller-file@2.0.5: {} + + get-package-type@0.1.0: {} + + get-stream@6.0.1: {} + + glob@10.4.5: + dependencies: + foreground-child: 3.3.0 + jackspeak: 3.4.3 + minimatch: 9.0.5 + minipass: 7.1.2 + package-json-from-dist: 1.0.1 + path-scurry: 1.11.1 + + glob@7.2.3: + dependencies: + fs.realpath: 1.0.0 + inflight: 1.0.6 + inherits: 2.0.4 + minimatch: 3.1.2 + once: 1.4.0 + path-is-absolute: 1.0.1 + + globals@11.12.0: {} + + graceful-fs@4.2.11: {} + + has-flag@4.0.0: {} + + hasown@2.0.2: + dependencies: + function-bind: 1.1.2 + + html-escaper@2.0.2: {} + + human-signals@2.1.0: {} + + import-local@3.2.0: + dependencies: + pkg-dir: 4.2.0 + resolve-cwd: 3.0.0 + + imurmurhash@0.1.4: {} + + inflight@1.0.6: + dependencies: + once: 1.4.0 + wrappy: 1.0.2 + + inherits@2.0.4: {} + + is-arrayish@0.2.1: {} + + is-core-module@2.16.0: + dependencies: + hasown: 2.0.2 + + is-fullwidth-code-point@3.0.0: {} + + is-generator-fn@2.1.0: {} + + is-number@7.0.0: {} + + is-stream@2.0.1: {} + + isexe@2.0.0: {} + + isows@1.0.6(ws@8.18.0): + dependencies: + ws: 8.18.0 + + istanbul-lib-coverage@3.2.2: {} + + istanbul-lib-instrument@5.2.1: + dependencies: + '@babel/core': 7.26.0 + '@babel/parser': 7.26.3 + '@istanbuljs/schema': 0.1.3 + istanbul-lib-coverage: 3.2.2 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + istanbul-lib-instrument@6.0.3: + dependencies: + '@babel/core': 7.26.0 + '@babel/parser': 7.26.3 + '@istanbuljs/schema': 0.1.3 + istanbul-lib-coverage: 3.2.2 + semver: 7.6.3 + transitivePeerDependencies: + - supports-color + + istanbul-lib-report@3.0.1: + dependencies: + istanbul-lib-coverage: 3.2.2 + make-dir: 4.0.0 + supports-color: 7.2.0 + + istanbul-lib-source-maps@4.0.1: + dependencies: + debug: 4.4.0 + istanbul-lib-coverage: 3.2.2 + source-map: 0.6.1 + transitivePeerDependencies: + - supports-color + + istanbul-reports@3.1.7: + dependencies: + html-escaper: 2.0.2 + istanbul-lib-report: 3.0.1 + + jackspeak@3.4.3: + dependencies: + '@isaacs/cliui': 8.0.2 + optionalDependencies: + '@pkgjs/parseargs': 0.11.0 + + jake@10.9.2: + dependencies: + async: 3.2.6 + chalk: 4.1.2 + filelist: 1.0.4 + minimatch: 3.1.2 + + jest-changed-files@29.7.0: + dependencies: + execa: 5.1.1 + jest-util: 29.7.0 + p-limit: 3.1.0 + + jest-circus@29.7.0: + dependencies: + '@jest/environment': 29.7.0 + '@jest/expect': 29.7.0 + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + chalk: 4.1.2 + co: 4.6.0 + dedent: 1.5.3 + is-generator-fn: 2.1.0 + jest-each: 29.7.0 + jest-matcher-utils: 29.7.0 + jest-message-util: 29.7.0 + jest-runtime: 29.7.0 + jest-snapshot: 29.7.0 + jest-util: 29.7.0 + p-limit: 3.1.0 + pretty-format: 29.7.0 + pure-rand: 6.1.0 + slash: 3.0.0 + stack-utils: 2.0.6 + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + + jest-cli@29.7.0(@types/node@20.17.10): + dependencies: + '@jest/core': 29.7.0 + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + chalk: 4.1.2 + create-jest: 29.7.0(@types/node@20.17.10) + exit: 0.1.2 + import-local: 3.2.0 + jest-config: 29.7.0(@types/node@20.17.10) + jest-util: 29.7.0 + jest-validate: 29.7.0 + yargs: 17.7.2 + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros + - supports-color + - ts-node + + jest-config@29.7.0(@types/node@20.17.10): + dependencies: + '@babel/core': 7.26.0 + '@jest/test-sequencer': 29.7.0 + '@jest/types': 29.6.3 + babel-jest: 29.7.0(@babel/core@7.26.0) + chalk: 4.1.2 + ci-info: 3.9.0 + deepmerge: 4.3.1 + glob: 7.2.3 + graceful-fs: 4.2.11 + jest-circus: 29.7.0 + jest-environment-node: 29.7.0 + jest-get-type: 29.6.3 + jest-regex-util: 29.6.3 + jest-resolve: 29.7.0 + jest-runner: 29.7.0 + jest-util: 29.7.0 + jest-validate: 29.7.0 + micromatch: 4.0.8 + parse-json: 5.2.0 + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 + optionalDependencies: + '@types/node': 20.17.10 + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + + jest-diff@29.7.0: + dependencies: + chalk: 4.1.2 + diff-sequences: 29.6.3 + jest-get-type: 29.6.3 + pretty-format: 29.7.0 + + jest-docblock@29.7.0: + dependencies: + detect-newline: 3.1.0 + + jest-each@29.7.0: + dependencies: + '@jest/types': 29.6.3 + chalk: 4.1.2 + jest-get-type: 29.6.3 + jest-util: 29.7.0 + pretty-format: 29.7.0 + + jest-environment-node@29.7.0: + dependencies: + '@jest/environment': 29.7.0 + '@jest/fake-timers': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + jest-mock: 29.7.0 + jest-util: 29.7.0 + + jest-get-type@29.6.3: {} + + jest-haste-map@29.7.0: + dependencies: + '@jest/types': 29.6.3 + '@types/graceful-fs': 4.1.9 + '@types/node': 20.17.10 + anymatch: 3.1.3 + fb-watchman: 2.0.2 + graceful-fs: 4.2.11 + jest-regex-util: 29.6.3 + jest-util: 29.7.0 + jest-worker: 29.7.0 + micromatch: 4.0.8 + walker: 1.0.8 + optionalDependencies: + fsevents: 2.3.3 + + jest-leak-detector@29.7.0: + dependencies: + jest-get-type: 29.6.3 + pretty-format: 29.7.0 + + jest-matcher-utils@29.7.0: + dependencies: + chalk: 4.1.2 + jest-diff: 29.7.0 + jest-get-type: 29.6.3 + pretty-format: 29.7.0 + + jest-message-util@29.7.0: + dependencies: + '@babel/code-frame': 7.26.2 + '@jest/types': 29.6.3 + '@types/stack-utils': 2.0.3 + chalk: 4.1.2 + graceful-fs: 4.2.11 + micromatch: 4.0.8 + pretty-format: 29.7.0 + slash: 3.0.0 + stack-utils: 2.0.6 + + jest-mock@29.7.0: + dependencies: + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + jest-util: 29.7.0 + + jest-pnp-resolver@1.2.3(jest-resolve@29.7.0): + optionalDependencies: + jest-resolve: 29.7.0 + + jest-regex-util@29.6.3: {} + + jest-resolve-dependencies@29.7.0: + dependencies: + jest-regex-util: 29.6.3 + jest-snapshot: 29.7.0 + transitivePeerDependencies: + - supports-color + + jest-resolve@29.7.0: + dependencies: + chalk: 4.1.2 + graceful-fs: 4.2.11 + jest-haste-map: 29.7.0 + jest-pnp-resolver: 1.2.3(jest-resolve@29.7.0) + jest-util: 29.7.0 + jest-validate: 29.7.0 + resolve: 1.22.9 + resolve.exports: 2.0.3 + slash: 3.0.0 + + jest-runner@29.7.0: + dependencies: + '@jest/console': 29.7.0 + '@jest/environment': 29.7.0 + '@jest/test-result': 29.7.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + chalk: 4.1.2 + emittery: 0.13.1 + graceful-fs: 4.2.11 + jest-docblock: 29.7.0 + jest-environment-node: 29.7.0 + jest-haste-map: 29.7.0 + jest-leak-detector: 29.7.0 + jest-message-util: 29.7.0 + jest-resolve: 29.7.0 + jest-runtime: 29.7.0 + jest-util: 29.7.0 + jest-watcher: 29.7.0 + jest-worker: 29.7.0 + p-limit: 3.1.0 + source-map-support: 0.5.13 + transitivePeerDependencies: + - supports-color + + jest-runtime@29.7.0: + dependencies: + '@jest/environment': 29.7.0 + '@jest/fake-timers': 29.7.0 + '@jest/globals': 29.7.0 + '@jest/source-map': 29.6.3 + '@jest/test-result': 29.7.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + chalk: 4.1.2 + cjs-module-lexer: 1.4.1 + collect-v8-coverage: 1.0.2 + glob: 7.2.3 + graceful-fs: 4.2.11 + jest-haste-map: 29.7.0 + jest-message-util: 29.7.0 + jest-mock: 29.7.0 + jest-regex-util: 29.6.3 + jest-resolve: 29.7.0 + jest-snapshot: 29.7.0 + jest-util: 29.7.0 + slash: 3.0.0 + strip-bom: 4.0.0 + transitivePeerDependencies: + - supports-color + + jest-snapshot@29.7.0: + dependencies: + '@babel/core': 7.26.0 + '@babel/generator': 7.26.3 + '@babel/plugin-syntax-jsx': 7.25.9(@babel/core@7.26.0) + '@babel/plugin-syntax-typescript': 7.25.9(@babel/core@7.26.0) + '@babel/types': 7.26.3 + '@jest/expect-utils': 29.7.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + babel-preset-current-node-syntax: 1.1.0(@babel/core@7.26.0) + chalk: 4.1.2 + expect: 29.7.0 + graceful-fs: 4.2.11 + jest-diff: 29.7.0 + jest-get-type: 29.6.3 + jest-matcher-utils: 29.7.0 + jest-message-util: 29.7.0 + jest-util: 29.7.0 + natural-compare: 1.4.0 + pretty-format: 29.7.0 + semver: 7.6.3 + transitivePeerDependencies: + - supports-color + + jest-util@29.7.0: + dependencies: + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + chalk: 4.1.2 + ci-info: 3.9.0 + graceful-fs: 4.2.11 + picomatch: 2.3.1 + + jest-validate@29.7.0: + dependencies: + '@jest/types': 29.6.3 + camelcase: 6.3.0 + chalk: 4.1.2 + jest-get-type: 29.6.3 + leven: 3.1.0 + pretty-format: 29.7.0 + + jest-watcher@29.7.0: + dependencies: + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 20.17.10 + ansi-escapes: 4.3.2 + chalk: 4.1.2 + emittery: 0.13.1 + jest-util: 29.7.0 + string-length: 4.0.2 + + jest-worker@29.7.0: + dependencies: + '@types/node': 20.17.10 + jest-util: 29.7.0 + merge-stream: 2.0.0 + supports-color: 8.1.1 + + jest@29.7.0(@types/node@20.17.10): + dependencies: + '@jest/core': 29.7.0 + '@jest/types': 29.6.3 + import-local: 3.2.0 + jest-cli: 29.7.0(@types/node@20.17.10) + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros + - supports-color + - ts-node + + joycon@3.1.1: {} + + js-tokens@4.0.0: {} + + js-yaml@3.14.1: + dependencies: + argparse: 1.0.10 + esprima: 4.0.1 + + jsesc@3.1.0: {} + + json-parse-even-better-errors@2.3.1: {} + + json5@2.2.3: {} + + kleur@3.0.3: {} + + leven@3.1.0: {} + + lilconfig@3.1.3: {} + + lines-and-columns@1.2.4: {} + + load-tsconfig@0.2.5: {} + + locate-path@5.0.0: + dependencies: + p-locate: 4.1.0 + + lodash.memoize@4.1.2: {} + + lodash.sortby@4.7.0: {} + + lru-cache@10.4.3: {} + + lru-cache@5.1.1: + dependencies: + yallist: 3.1.1 + + make-dir@4.0.0: + dependencies: + semver: 7.6.3 + + make-error@1.3.6: {} + + makeerror@1.0.12: + dependencies: + tmpl: 1.0.5 + + merge-stream@2.0.0: {} + + micromatch@4.0.8: + dependencies: + braces: 3.0.3 + picomatch: 2.3.1 + + mime-db@1.52.0: {} + + mime-types@2.1.35: + dependencies: + mime-db: 1.52.0 + + mimic-fn@2.1.0: {} + + minimatch@3.1.2: + dependencies: + brace-expansion: 1.1.11 + + minimatch@5.1.6: + dependencies: + brace-expansion: 2.0.1 + + minimatch@9.0.5: + dependencies: + brace-expansion: 2.0.1 + + minipass@7.1.2: {} + + ms@2.1.3: {} + + mz@2.7.0: + dependencies: + any-promise: 1.3.0 + object-assign: 4.1.1 + thenify-all: 1.6.0 + + natural-compare@1.4.0: {} + + node-int64@0.4.0: {} + + node-releases@2.0.19: {} + + normalize-path@3.0.0: {} + + npm-run-path@4.0.1: + dependencies: + path-key: 3.1.1 + + object-assign@4.1.1: {} + + once@1.4.0: + dependencies: + wrappy: 1.0.2 + + onetime@5.1.2: + dependencies: + mimic-fn: 2.1.0 + + p-limit@2.3.0: + dependencies: + p-try: 2.2.0 + + p-limit@3.1.0: + dependencies: + yocto-queue: 0.1.0 + + p-locate@4.1.0: + dependencies: + p-limit: 2.3.0 + + p-try@2.2.0: {} + + package-json-from-dist@1.0.1: {} + + parse-json@5.2.0: + dependencies: + '@babel/code-frame': 7.26.2 + error-ex: 1.3.2 + json-parse-even-better-errors: 2.3.1 + lines-and-columns: 1.2.4 + + path-exists@4.0.0: {} + + path-is-absolute@1.0.1: {} + + path-key@3.1.1: {} + + path-parse@1.0.7: {} + + path-scurry@1.11.1: + dependencies: + lru-cache: 10.4.3 + minipass: 7.1.2 + + picocolors@1.1.1: {} + + picomatch@2.3.1: {} + + picomatch@4.0.2: {} + + pirates@4.0.6: {} + + pkg-dir@4.2.0: + dependencies: + find-up: 4.1.0 + + postcss-load-config@6.0.1: + dependencies: + lilconfig: 3.1.3 + + pretty-format@29.7.0: + dependencies: + '@jest/schemas': 29.6.3 + ansi-styles: 5.2.0 + react-is: 18.3.1 + + prompts@2.4.2: + dependencies: + kleur: 3.0.3 + sisteransi: 1.0.5 + + proxy-from-env@1.1.0: {} + + punycode@2.3.1: {} + + pure-rand@6.1.0: {} + + react-is@18.3.1: {} + + readdirp@4.0.2: {} + + require-directory@2.1.1: {} + + resolve-cwd@3.0.0: + dependencies: + resolve-from: 5.0.0 + + resolve-from@5.0.0: {} + + resolve.exports@2.0.3: {} + + resolve@1.22.9: + dependencies: + is-core-module: 2.16.0 + path-parse: 1.0.7 + supports-preserve-symlinks-flag: 1.0.0 + + rollup@4.28.1: + dependencies: + '@types/estree': 1.0.6 + optionalDependencies: + '@rollup/rollup-android-arm-eabi': 4.28.1 + '@rollup/rollup-android-arm64': 4.28.1 + '@rollup/rollup-darwin-arm64': 4.28.1 + '@rollup/rollup-darwin-x64': 4.28.1 + '@rollup/rollup-freebsd-arm64': 4.28.1 + '@rollup/rollup-freebsd-x64': 4.28.1 + '@rollup/rollup-linux-arm-gnueabihf': 4.28.1 + '@rollup/rollup-linux-arm-musleabihf': 4.28.1 + '@rollup/rollup-linux-arm64-gnu': 4.28.1 + '@rollup/rollup-linux-arm64-musl': 4.28.1 + '@rollup/rollup-linux-loongarch64-gnu': 4.28.1 + '@rollup/rollup-linux-powerpc64le-gnu': 4.28.1 + '@rollup/rollup-linux-riscv64-gnu': 4.28.1 + '@rollup/rollup-linux-s390x-gnu': 4.28.1 + '@rollup/rollup-linux-x64-gnu': 4.28.1 + '@rollup/rollup-linux-x64-musl': 4.28.1 + '@rollup/rollup-win32-arm64-msvc': 4.28.1 + '@rollup/rollup-win32-ia32-msvc': 4.28.1 + '@rollup/rollup-win32-x64-msvc': 4.28.1 + fsevents: 2.3.3 + + semver@6.3.1: {} + + semver@7.6.3: {} + + shebang-command@2.0.0: + dependencies: + shebang-regex: 3.0.0 + + shebang-regex@3.0.0: {} + + signal-exit@3.0.7: {} + + signal-exit@4.1.0: {} + + sisteransi@1.0.5: {} + + slash@3.0.0: {} + + source-map-support@0.5.13: + dependencies: + buffer-from: 1.1.2 + source-map: 0.6.1 + + source-map@0.6.1: {} + + source-map@0.8.0-beta.0: + dependencies: + whatwg-url: 7.1.0 + + sprintf-js@1.0.3: {} + + stack-utils@2.0.6: + dependencies: + escape-string-regexp: 2.0.0 + + string-length@4.0.2: + dependencies: + char-regex: 1.0.2 + strip-ansi: 6.0.1 + + string-width@4.2.3: + dependencies: + emoji-regex: 8.0.0 + is-fullwidth-code-point: 3.0.0 + strip-ansi: 6.0.1 + + string-width@5.1.2: + dependencies: + eastasianwidth: 0.2.0 + emoji-regex: 9.2.2 + strip-ansi: 7.1.0 + + strip-ansi@6.0.1: + dependencies: + ansi-regex: 5.0.1 + + strip-ansi@7.1.0: + dependencies: + ansi-regex: 6.1.0 + + strip-bom@4.0.0: {} + + strip-final-newline@2.0.0: {} + + strip-json-comments@3.1.1: {} + + sucrase@3.35.0: + dependencies: + '@jridgewell/gen-mapping': 0.3.8 + commander: 4.1.1 + glob: 10.4.5 + lines-and-columns: 1.2.4 + mz: 2.7.0 + pirates: 4.0.6 + ts-interface-checker: 0.1.13 + + supports-color@7.2.0: + dependencies: + has-flag: 4.0.0 + + supports-color@8.1.1: + dependencies: + has-flag: 4.0.0 + + supports-preserve-symlinks-flag@1.0.0: {} + + test-exclude@6.0.0: + dependencies: + '@istanbuljs/schema': 0.1.3 + glob: 7.2.3 + minimatch: 3.1.2 + + thenify-all@1.6.0: + dependencies: + thenify: 3.3.1 + + thenify@3.3.1: + dependencies: + any-promise: 1.3.0 + + tinyexec@0.3.1: {} + + tinyglobby@0.2.10: + dependencies: + fdir: 6.4.2(picomatch@4.0.2) + picomatch: 4.0.2 + + tmpl@1.0.5: {} + + to-regex-range@5.0.1: + dependencies: + is-number: 7.0.0 + + tr46@1.0.1: + dependencies: + punycode: 2.3.1 + + tree-kill@1.2.2: {} + + ts-interface-checker@0.1.13: {} + + ts-jest@29.2.5(@babel/core@7.26.0)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.26.0))(esbuild@0.24.0)(jest@29.7.0(@types/node@20.17.10))(typescript@5.7.2): + dependencies: + bs-logger: 0.2.6 + ejs: 3.1.10 + fast-json-stable-stringify: 2.1.0 + jest: 29.7.0(@types/node@20.17.10) + jest-util: 29.7.0 + json5: 2.2.3 + lodash.memoize: 4.1.2 + make-error: 1.3.6 + semver: 7.6.3 + typescript: 5.7.2 + yargs-parser: 21.1.1 + optionalDependencies: + '@babel/core': 7.26.0 + '@jest/transform': 29.7.0 + '@jest/types': 29.6.3 + babel-jest: 29.7.0(@babel/core@7.26.0) + esbuild: 0.24.0 + + tsup@8.3.5(typescript@5.7.2): + dependencies: + bundle-require: 5.0.0(esbuild@0.24.0) + cac: 6.7.14 + chokidar: 4.0.3 + consola: 3.2.3 + debug: 4.4.0 + esbuild: 0.24.0 + joycon: 3.1.1 + picocolors: 1.1.1 + postcss-load-config: 6.0.1 + resolve-from: 5.0.0 + rollup: 4.28.1 + source-map: 0.8.0-beta.0 + sucrase: 3.35.0 + tinyexec: 0.3.1 + tinyglobby: 0.2.10 + tree-kill: 1.2.2 + optionalDependencies: + typescript: 5.7.2 + transitivePeerDependencies: + - jiti + - supports-color + - tsx + - yaml + + type-detect@4.0.8: {} + + type-fest@0.21.3: {} + + typescript-event-target@1.1.1: {} + + typescript@5.7.2: {} + + undici-types@6.19.8: {} + + update-browserslist-db@1.1.1(browserslist@4.24.3): + dependencies: + browserslist: 4.24.3 + escalade: 3.2.0 + picocolors: 1.1.1 + + uuid@9.0.1: {} + + v8-to-istanbul@9.3.0: + dependencies: + '@jridgewell/trace-mapping': 0.3.25 + '@types/istanbul-lib-coverage': 2.0.6 + convert-source-map: 2.0.0 + + walker@1.0.8: + dependencies: + makeerror: 1.0.12 + + webidl-conversions@4.0.2: {} + + whatwg-url@7.1.0: + dependencies: + lodash.sortby: 4.7.0 + tr46: 1.0.1 + webidl-conversions: 4.0.2 + + which@2.0.2: + dependencies: + isexe: 2.0.0 + + wrap-ansi@7.0.0: + dependencies: + ansi-styles: 4.3.0 + string-width: 4.2.3 + strip-ansi: 6.0.1 + + wrap-ansi@8.1.0: + dependencies: + ansi-styles: 6.2.1 + string-width: 5.1.2 + strip-ansi: 7.1.0 + + wrappy@1.0.2: {} + + write-file-atomic@4.0.2: + dependencies: + imurmurhash: 0.1.4 + signal-exit: 3.0.7 + + ws@8.18.0: {} + + y18n@5.0.8: {} + + yallist@3.1.1: {} + + yargs-parser@21.1.1: {} + + yargs@17.7.2: + dependencies: + cliui: 8.0.1 + escalade: 3.2.0 + get-caller-file: 2.0.5 + require-directory: 2.1.1 + string-width: 4.2.3 + y18n: 5.0.8 + yargs-parser: 21.1.1 + + yocto-queue@0.1.0: {} + + zod-to-json-schema@3.24.1(zod@3.24.1): + dependencies: + zod: 3.24.1 + + zod@3.24.1: {} diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 28fbe075..779c5e71 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -167,6 +167,7 @@ export interface ScrapeParams { prompt?: string; schema?: LLMSchema | object; @@ -306,6 +322,7 @@ export interface ExtractParams { origin?: string; showSources?: boolean; scrapeOptions?: CrawlScrapeOptions; + agent?: AgentOptionsExtract; } /** diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 02d39e74..b033a9d0 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -107,6 +107,7 @@ class FirecrawlApp: # Just for backwards compatibility enableWebSearch: Optional[bool] = False show_sources: Optional[bool] = False + agent: Optional[Dict[str, Any]] = None @@ -183,8 +184,12 @@ class FirecrawlApp: # Include any other params directly at the top level of scrape_params for key, value in params.items(): - if key not in ['jsonOptions', 'changeTrackingOptions']: + if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']: scrape_params[key] = value + + agent = params.get('agent') + if agent: + scrape_params['agent'] = agent endpoint = f'/v1/scrape' @@ -706,6 +711,9 @@ class FirecrawlApp: request_data['systemPrompt'] = params['system_prompt'] elif params.get('systemPrompt'): # Check legacy field name request_data['systemPrompt'] = params['systemPrompt'] + + if params.get('agent'): + request_data['agent'] = params['agent'] try: # Send the initial extract request From edb40d75c18ef88ad64ae91889380ed5e1294bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 01:46:31 -0700 Subject: [PATCH 109/160] log session ID --- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 1afe663a..7ccebe75 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -51,7 +51,7 @@ export async function smartScrape( sessionId?: string, ): Promise { try { - logger.info("Initiating smart scrape request", { url, prompt }); + logger.info("Initiating smart scrape request", { url, prompt, sessionId }); // Pass schema type as generic parameter to robustFeth const response = await robustFetch({ From 9400b1423aaad2f7687ed4d413014518db5fc5d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 02:07:40 -0700 Subject: [PATCH 110/160] fix typing --- apps/api/src/controllers/v1/types.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9e212d4d..8ecaf899 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -379,11 +379,14 @@ export const scrapeOptions = baseScrapeOptions .object({ model: z.string().default(agentExtractModelValue), prompt: z.string().optional(), + sessionId: z.string().optional(), + waitBeforeClosingMs: z.number().optional(), }) .optional(), extract: extractOptionsWithAgent.optional(), jsonOptions: extractOptionsWithAgent.optional(), }) + .strict(strictMessage) .refine( (obj) => { if (!obj.actions) return true; @@ -400,7 +403,16 @@ export const scrapeOptions = baseScrapeOptions .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); -export type ScrapeOptions = z.infer; +export type ScrapeOptions = z.infer & { + extract: z.infer, + jsonOptions: z.infer, + agent: { + model: string, + prompt: string, + sessionId?: string, + waitBeforeClosingMs?: number, + }, +}; import Ajv from "ajv"; import type { CostTracking } from "../../lib/extract/extraction-service"; From 0ee96039452849bd94bd6424f6de95684bb01a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 02:09:52 -0700 Subject: [PATCH 111/160] FIX MORE --- apps/api/src/controllers/v1/types.ts | 10 ++++++---- apps/api/src/types.ts | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 8ecaf899..334dea16 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -403,10 +403,12 @@ export const scrapeOptions = baseScrapeOptions .refine(fire1Refine, fire1RefineOpts) .transform(extractTransform); -export type ScrapeOptions = z.infer & { - extract: z.infer, - jsonOptions: z.infer, - agent: { +export type BaseScrapeOptions = z.infer; + +export type ScrapeOptions = BaseScrapeOptions & { + extract?: z.infer, + jsonOptions?: z.infer, + agent?: { model: string, prompt: string, sessionId?: string, diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 65e3a428..03f0a015 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,6 +1,7 @@ import { z } from "zod"; import { AuthCreditUsageChunk, + BaseScrapeOptions, ScrapeOptions, Document as V1Document, webhookSchema, @@ -35,7 +36,7 @@ export interface WebScraperOptions { url: string; mode: Mode; crawlerOptions?: any; - scrapeOptions: ScrapeOptions; + scrapeOptions: BaseScrapeOptions; internalOptions?: InternalOptions; team_id: string; origin?: string; From 31e24e907c2bd019b2db33a39e3c935144355543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 02:13:31 -0700 Subject: [PATCH 112/160] FIX DAT --- apps/api/src/controllers/v1/types.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 334dea16..0a8c9fe4 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -503,6 +503,8 @@ export const scrapeRequestSchema = baseScrapeOptions .object({ model: z.string().default(agentExtractModelValue), prompt: z.string().optional(), + sessionId: z.string().optional(), + waitBeforeClosingMs: z.number().optional(), }) .optional(), extract: extractOptionsWithAgent.optional(), From a840db9ef36dd426b88d57b08eb9226150ea0b73 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 23:09:11 +0200 Subject: [PATCH 113/160] Set default timeout to 120s when proxy is stealth (#1464) Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: mogery@sideguide.dev --- apps/api/src/controllers/v1/types.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 0a8c9fe4..92c95e20 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -352,6 +352,10 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 300000 }; } + if (obj.proxy === "stealth" && obj.timeout === 30000) { + obj = { ...obj, timeout: 120000 }; + } + if (obj.formats?.includes("json")) { obj.formats.push("extract"); } From 252a9ccc89943c12f9e8bf4ab0db8d399e076fde Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:15:01 -0700 Subject: [PATCH 114/160] Refactor robustFetch logging to exclude sensitive parameters and improve error handling. --- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 31 +++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index e4c3802f..d6bd5157 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -123,10 +123,11 @@ export async function robustFetch< return null as Output; } + const resp = await request.text(); response = { status: request.status, headers: request.headers, - body: await request.text(), // NOTE: can this throw an exception? + body: resp, // NOTE: can this throw an exception? }; } else { if (ignoreResponse === true) { @@ -171,7 +172,7 @@ export async function robustFetch< if (tryCount > 1) { logger.debug( "Request sent failure status, trying " + (tryCount - 1) + " more times", - { params, response, requestId }, + { params: { ...params, logger: undefined }, response: { status: response.status, body: response.body }, requestId }, ); if (tryCooldown !== undefined) { await new Promise((resolve) => @@ -186,14 +187,14 @@ export async function robustFetch< }); } else { logger.debug("Request sent failure status", { - params, - response, + params: { ...params, logger: undefined }, + response: { status: response.status, body: response.body }, requestId, }); throw new Error("Request sent failure status", { cause: { - params, - response, + params: { ...params, logger: undefined }, + response: { status: response.status, body: response.body }, requestId, }, }); @@ -217,13 +218,13 @@ export async function robustFetch< data = JSON.parse(response.body); } catch (error) { logger.debug("Request sent malformed JSON", { - params, - response, + params: { ...params, logger: undefined }, + response: { status: response.status, body: response.body }, requestId, }); throw new Error("Request sent malformed JSON", { cause: { - params, + params: { ...params, logger: undefined }, response, requestId, }, @@ -236,15 +237,15 @@ export async function robustFetch< } catch (error) { if (error instanceof ZodError) { logger.debug("Response does not match provided schema", { - params, - response, + params: { ...params, logger: undefined }, + response: { status: response.status, body: response.body }, requestId, error, schema, }); throw new Error("Response does not match provided schema", { cause: { - params, + params: { ...params, logger: undefined }, response, requestId, error, @@ -253,15 +254,15 @@ export async function robustFetch< }); } else { logger.debug("Parsing response with provided schema failed", { - params, - response, + params: { ...params, logger: undefined }, + response: { status: response.status, body: response.body }, requestId, error, schema, }); throw new Error("Parsing response with provided schema failed", { cause: { - params, + params: { ...params, logger: undefined }, response, requestId, error, From 3ccef5fb666f6b308243f1d4b0d34531b70691a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 17:42:26 -0700 Subject: [PATCH 115/160] fix(v1): scrape-status with GCS --- apps/api/src/controllers/v1/scrape-status.ts | 13 ++++++++++++- apps/api/src/lib/supabase-jobs.ts | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index 416ecdad..fac89b1c 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -1,5 +1,6 @@ import { Response } from "express"; import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs"; +import { getJob } from "./crawl-status"; export async function scrapeStatusController(req: any, res: any) { const allowedTeams = [ @@ -17,6 +18,13 @@ export async function scrapeStatusController(req: any, res: any) { const job = await supabaseGetJobByIdOnlyData(req.params.jobId); + if (!job) { + return res.status(404).json({ + success: false, + error: "Job not found.", + }); + } + if ( !allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id @@ -27,7 +35,10 @@ export async function scrapeStatusController(req: any, res: any) { }); } - const data = job?.docs[0]; + const jobData = await getJob(req.params.jobId); + const data = Array.isArray(jobData?.returnvalue) + ? jobData?.returnvalue[0] + : jobData?.returnvalue; return res.status(200).json({ success: true, diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index e36f3b97..874c5293 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -76,7 +76,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { export const supabaseGetJobByIdOnlyData = async (jobId: string) => { const { data, error } = await supabase_rr_service .from("firecrawl_jobs") - .select("docs, team_id") + .select("team_id") .eq("job_id", jobId) .single(); From c8a8e96acc91278687ff7a7476d58c2608cb2a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 17:48:54 -0700 Subject: [PATCH 116/160] un-gate scrape status, add test --- apps/api/src/__tests__/snips/lib.ts | 17 +++++++++++++++++ apps/api/src/__tests__/snips/scrape.test.ts | 13 ++++++++++++- apps/api/src/controllers/v1/scrape-status.ts | 14 -------------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/apps/api/src/__tests__/snips/lib.ts b/apps/api/src/__tests__/snips/lib.ts index cfd7515b..f87be6d4 100644 --- a/apps/api/src/__tests__/snips/lib.ts +++ b/apps/api/src/__tests__/snips/lib.ts @@ -34,6 +34,23 @@ export async function scrape(body: ScrapeRequestInput): Promise { return raw.body.data; } +export async function scrapeStatusRaw(jobId: string) { + return await request(TEST_URL) + .get("/v1/scrape/" + encodeURIComponent(jobId)) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .send(); +} + +export async function scrapeStatus(jobId: string): Promise { + const raw = await scrapeStatusRaw(jobId); + expect(raw.statusCode).toBe(200); + expect(raw.body.success).toBe(true); + expect(typeof raw.body.data).toBe("object"); + expect(raw.body.data).not.toBeNull(); + expect(raw.body.data).toBeDefined(); + return raw.body.data; +} + // ========================================= // Crawl API // ========================================= diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index b9f1401e..a7adffe6 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -1,4 +1,4 @@ -import { scrape } from "./lib"; +import { scrape, scrapeStatus } from "./lib"; describe("Scrape tests", () => { it.concurrent("mocking works properly", async () => { @@ -24,6 +24,17 @@ describe("Scrape tests", () => { expect(response.markdown).toContain("Firecrawl"); }, 30000); + it.concurrent("scrape status works", async () => { + const response = await scrape({ + url: "http://firecrawl.dev" + }); + + expect(response.markdown).toContain("Firecrawl"); + + const status = await scrapeStatus(response.metadata.scrapeId!); + expect(JSON.stringify(status)).toBe(JSON.stringify(response)); + }, 60000); + it.concurrent("handles non-UTF-8 encodings", async () => { const response = await scrape({ url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html", diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index fac89b1c..e89f724e 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -3,19 +3,6 @@ import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs"; import { getJob } from "./crawl-status"; export async function scrapeStatusController(req: any, res: any) { - const allowedTeams = [ - "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", - "511544f2-2fce-4183-9c59-6c29b02c69b5", - "1ec9a0b3-6e7d-49a9-ad6c-9c598ba824c8", - ]; - - if (!allowedTeams.includes(req.auth.team_id)) { - return res.status(403).json({ - success: false, - error: "Forbidden", - }); - } - const job = await supabaseGetJobByIdOnlyData(req.params.jobId); if (!job) { @@ -26,7 +13,6 @@ export async function scrapeStatusController(req: any, res: any) { } if ( - !allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id ) { return res.status(403).json({ From 524b9770cd33e6be6a44b4fa18f3e7086e5b4ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Apr 2025 17:52:20 -0700 Subject: [PATCH 117/160] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 38 +++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 3f096448..1118a0f9 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -433,25 +433,25 @@ const processExtractJobInternal = async ( try { let result: ExtractResult | null = null; - // const model = job.data.request.agent?.model - // if (job.data.request.agent && model && model.toLowerCase().includes("fire-1")) { - // result = await performExtraction(job.data.extractId, { - // request: job.data.request, - // teamId: job.data.teamId, - // subId: job.data.subId, - // }); - // } else { - // result = await performExtraction_F0(job.data.extractId, { - // request: job.data.request, - // teamId: job.data.teamId, - // subId: job.data.subId, - // }); - // } - result = await performExtraction_F0(job.data.extractId, { - request: job.data.request, - teamId: job.data.teamId, - subId: job.data.subId, - }); + const model = job.data.request.agent?.model + if (job.data.request.agent && model && model.toLowerCase().includes("fire-1")) { + result = await performExtraction(job.data.extractId, { + request: job.data.request, + teamId: job.data.teamId, + subId: job.data.subId, + }); + } else { + result = await performExtraction_F0(job.data.extractId, { + request: job.data.request, + teamId: job.data.teamId, + subId: job.data.subId, + }); + } + // result = await performExtraction_F0(job.data.extractId, { + // request: job.data.request, + // teamId: job.data.teamId, + // subId: job.data.subId, + // }); if (result && result.success) { // Move job to completed state in Redis From 5515ca7a529112f8eafc90ff37aeb5b571e8fc18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 19:49:14 -0700 Subject: [PATCH 118/160] fix(llm-cost): update --- apps/api/src/lib/extract/usage/llm-cost.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/usage/llm-cost.ts b/apps/api/src/lib/extract/usage/llm-cost.ts index 81e2d2f7..58ab53cd 100644 --- a/apps/api/src/lib/extract/usage/llm-cost.ts +++ b/apps/api/src/lib/extract/usage/llm-cost.ts @@ -8,7 +8,7 @@ interface ModelPricing { input_cost_per_request?: number; mode: string; } -const tokenPerCharacter = 4; +const tokenPerCharacter = 0.5; const baseTokenCost = 300; export function calculateFinalResultCost(data: any): number { From d119552eee24eedf3651e60e80ce6835ee05d8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 20:20:29 -0700 Subject: [PATCH 119/160] bump rate limits --- apps/api/src/services/rate-limiter.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 2a502c3b..bcf423ac 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -31,8 +31,8 @@ const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = { preview: 25, extractStatus: 25000, crawlStatus: 25000, - extractAgentPreview: 1, - scrapeAgentPreview: 5, + extractAgentPreview: 10, + scrapeAgentPreview: 10, }; export function getRateLimiter( From eea1267b504ff3f1dea635bd77afb61e7ed1e1ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 20:28:53 -0700 Subject: [PATCH 120/160] feat(batchExtract): thingymajig --- apps/api/src/lib/extract/completions/batchExtract.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index e7f2bb03..aecad61f 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -1,4 +1,3 @@ -import { logger } from "../../../lib/logger"; import { generateCompletions, GenerateCompletionsOptions, @@ -14,6 +13,7 @@ import { getModel } from "../../generic-ai"; import fs from "fs/promises"; import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; +import type { Logger } from "winston"; type BatchExtractOptions = { multiEntitySchema: any; @@ -33,7 +33,7 @@ type BatchExtractOptions = { * @param doc - The document to extract information from * @returns The completion promise */ -export async function batchExtractPromise(options: BatchExtractOptions): Promise<{ +export async function batchExtractPromise(options: BatchExtractOptions, logger: Logger): Promise<{ extract: any; // array of extracted data numTokens: number; totalUsage: TokenUsage; @@ -82,7 +82,7 @@ export async function batchExtractPromise(options: BatchExtractOptions): Promise smCallCount = smartScrapeCallCount; oCallCount = otherCallCount; } catch (error) { - console.error(">>>>>>>error>>>>>\n", error); + logger.error("extractData failed", { error }); } // await fs.writeFile( From 0abe60085b7464a674fde67e466a10272aaa491a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 20:29:01 -0700 Subject: [PATCH 121/160] fix --- apps/api/src/lib/extract/extraction-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 882d115d..22d6f6c7 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -438,7 +438,7 @@ export async function performExtraction( systemPrompt: request.systemPrompt ?? "", doc, useAgent: isAgentExtractModelValid(request.agent?.model) - }); + }, logger); // Race between timeout and completion const multiEntityCompletion = (await completionPromise) as Awaited< From 13bd50ad2bc9d7f10f93ba0868bf69a56e5a7fc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 21:57:05 -0700 Subject: [PATCH 122/160] feat(fetch): don't time out (for smart scrape) --- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index d6bd5157..c64e7953 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -4,6 +4,7 @@ import * as Sentry from "@sentry/node"; import { MockState, saveMock } from "./mock"; import { TimeoutSignal } from "../../../controllers/v1/types"; import { fireEngineURL } from "../engines/fire-engine/scrape"; +import { fetch, RequestInit, Response, FormData, Agent } from "undici"; export type RobustFetchParams> = { url: string; @@ -78,6 +79,10 @@ export async function robustFetch< ...(headers !== undefined ? headers : {}), }, signal: abort, + dispatcher: new Agent({ + headersTimeout: 0, + bodyTimeout: 0, + }), ...(body instanceof FormData ? { body, From 512a2b1cd494d64a8139f089116b224cd7790a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 22:57:20 -0700 Subject: [PATCH 123/160] feat(extract): run on original links if reranker is weird --- .../api/src/lib/extract/extraction-service.ts | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 22d6f6c7..1ea5651f 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -273,28 +273,21 @@ export async function performExtraction( ); const processedUrls = await Promise.all(urlPromises); - const links = processedUrls.flat().filter((url) => url); + let links = processedUrls.flat().filter((url) => url); logger.debug("Processed URLs.", { linkCount: links.length, }); - log["links"] = links; - log["linksLength"] = links.length; - if (links.length === 0) { - logger.error("0 links! Bailing.", { + links = urls.map(x => x.replace(/\*$/g, "")); + logger.warn("0 links! Doing just the original URLs. (without * wildcard)", { linkCount: links.length, }); - return { - success: false, - error: - "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.", - extractId, - urlTrace: urlTraces, - totalUrlsScraped: 0, - }; } + log["links"] = links; + log["linksLength"] = links.length; + await updateExtract(extractId, { status: "processing", steps: [ From 80b507e64e0ae81712e5f2395d0496b0f300d344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:06:13 -0700 Subject: [PATCH 124/160] correlate with eid --- apps/api/src/lib/extract/completions/batchExtract.ts | 4 +++- apps/api/src/lib/extract/completions/singleAnswer.ts | 5 ++++- apps/api/src/lib/extract/extraction-service.ts | 4 +++- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 5 ++++- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 1 + 5 files changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index aecad61f..1bffeaa2 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -22,6 +22,7 @@ type BatchExtractOptions = { systemPrompt: string; doc: Document; useAgent: boolean; + extractId?: string; }; /** @@ -44,7 +45,7 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: smartScrapeCallCount: number; otherCallCount: number; }> { - const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent } = options; + const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options; const generationOptions: GenerateCompletionsOptions = { @@ -74,6 +75,7 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: extractOptions: generationOptions, urls: [doc.metadata.sourceURL || doc.metadata.url || ""], useAgent, + extractId, }); extractedDataArray = e; warning = w; diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 27e3cad0..5e76954e 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -14,7 +14,8 @@ export async function singleAnswerCompletion({ links, prompt, systemPrompt, - useAgent + useAgent, + extractId, }: { singleAnswerDocs: Document[]; rSchema: any; @@ -22,6 +23,7 @@ export async function singleAnswerCompletion({ prompt: string; systemPrompt: string; useAgent: boolean; + extractId?: string; }): Promise<{ extract: any; tokenUsage: TokenUsage; @@ -51,6 +53,7 @@ export async function singleAnswerCompletion({ extractOptions: generationOptions, urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""), useAgent, + extractId, }); const completion = { diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1ea5651f..1a6a4262 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -430,7 +430,8 @@ export async function performExtraction( prompt: request.prompt ?? "", systemPrompt: request.systemPrompt ?? "", doc, - useAgent: isAgentExtractModelValid(request.agent?.model) + useAgent: isAgentExtractModelValid(request.agent?.model), + extractId, }, logger); // Race between timeout and completion @@ -741,6 +742,7 @@ export async function performExtraction( prompt: request.prompt ?? "", systemPrompt: request.systemPrompt ?? "", useAgent: isAgentExtractModelValid(request.agent?.model), + extractId, }); costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 840c56fc..7380f380 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -184,10 +184,12 @@ export async function extractData({ extractOptions, urls, useAgent, + extractId, }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; + extractId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -273,7 +275,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt), + await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -285,6 +287,7 @@ export async function extractData({ return await smartScrape( urls[page.page_index], page.smartscrape_prompt, + extractId, ); }), ); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 7ccebe75..8e0b45f6 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -49,6 +49,7 @@ export async function smartScrape( url: string, prompt: string, sessionId?: string, + extractId?: string, ): Promise { try { logger.info("Initiating smart scrape request", { url, prompt, sessionId }); From b6abe4f26b9a4a4441252ba326147fcc5b08b662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:23:18 -0700 Subject: [PATCH 125/160] fix(smartScrape): pass extract id --- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 8e0b45f6..ff0316c7 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -62,6 +62,7 @@ export async function smartScrape( url, prompt, userProvidedId: sessionId ?? undefined, + extractId, models: { thinkingModel: { model: "gemini-2.5-pro-preview-03-25", From 0935ec210e6ec7694d2ed46785e29dc09f3b5847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:34:00 -0700 Subject: [PATCH 126/160] feat(smartScrape): better loggin --- .../api/src/scraper/scrapeURL/lib/smartScrape.ts | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index ff0316c7..4e4cbb20 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { logger } from "../../../lib/logger"; +import { logger as _logger } from "../../../lib/logger"; import { robustFetch } from "./fetch"; import fs from "fs/promises"; import { configDotenv } from "dotenv"; @@ -51,8 +51,16 @@ export async function smartScrape( sessionId?: string, extractId?: string, ): Promise { + let logger = _logger.child({ + method: "smartScrape", + module: "smartScrape", + extractId, + url, + prompt, + sessionId, + }); try { - logger.info("Initiating smart scrape request", { url, prompt, sessionId }); + logger.info("Initiating smart scrape request"); // Pass schema type as generic parameter to robustFeth const response = await robustFetch({ @@ -116,8 +124,6 @@ export async function smartScrape( } logger.info("Smart scrape successful", { - url, - prompt, sessionId: response.sessionId, }); @@ -155,8 +161,6 @@ export async function smartScrape( }; logger.error("Smart scrape request failed", { - url, - prompt, error: JSON.stringify(errorInfo), }); From 2193bee13356b814d280d7c799e6d0e851edb09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:52:20 -0700 Subject: [PATCH 127/160] Improve logging --- .../lib/extract/completions/singleAnswer.ts | 8 ++++++-- .../scrapeURL/lib/extractSmartScrape.ts | 18 +++++++----------- .../scrapeURL/transformers/llmExtract.ts | 1 + 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 5e76954e..4b36a58f 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -23,7 +23,7 @@ export async function singleAnswerCompletion({ prompt: string; systemPrompt: string; useAgent: boolean; - extractId?: string; + extractId: string; }): Promise<{ extract: any; tokenUsage: TokenUsage; @@ -35,7 +35,11 @@ export async function singleAnswerCompletion({ }> { const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; const generationOptions: GenerateCompletionsOptions = { - logger: logger.child({ module: "extract", method: "generateCompletions" }), + logger: logger.child({ + module: "extract", + method: "generateCompletions", + extractId, + }), options: { mode: "llm", systemPrompt: diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 7380f380..955832ff 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -211,7 +211,7 @@ export async function extractData({ if (!schema && extractOptions.options.prompt) { logger.info("Generating schema from prompt"); - const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt); + const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger); otherCallCount++; otherCost += genRes.cost; schema = genRes.extract; @@ -249,7 +249,7 @@ export async function extractData({ } catch (error) { logger.error( "failed during extractSmartScrape.ts:generateCompletions", - error, + { error }, ); // console.log("failed during extractSmartScrape.ts:generateCompletions", error); } @@ -260,16 +260,12 @@ export async function extractData({ // console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); // console.log("smartscrape_prompt", extract?.smartscrape_prompt); try { - console.log("========================================="); - console.log( - "useAgent:", + logger.info("Smart schema resolved", { useAgent, - "shouldUseSmartscrape:", - extract?.shouldUseSmartscrape, - ); - console.log("url:", urls); - console.log("prompt:", extract?.smartscrape_prompt); - console.log("========================================="); + shouldUseSmartscrape: extract?.shouldUseSmartscrape, + url: urls, + prompt: extract?.smartscrape_prompt, + }) if (useAgent && extract?.shouldUseSmartscrape) { let smartscrapeResults: SmartScrapeResult[]; diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 502e268e..ebdb7a6b 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -761,6 +761,7 @@ export function removeDefaultProperty(schema: any): any { export async function generateSchemaFromPrompt( prompt: string, + logger: Logger, ): Promise<{ extract: any; cost: number }> { const model = getModel("gpt-4o", "openai"); const retryModel = getModel("gpt-4o-mini", "openai"); From 2245650bc3db47ad8c5ef12e40fe24bccc475375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:54:56 -0700 Subject: [PATCH 128/160] fix --- .../src/lib/extract/completions/analyzeSchemaAndPrompt.ts | 5 +++-- apps/api/src/lib/extract/extraction-service.ts | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index b02588be..98e3ccc0 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -8,14 +8,15 @@ import { buildAnalyzeSchemaPrompt, buildAnalyzeSchemaUserPrompt, } from "../build-prompts"; -import { logger } from "../../../lib/logger"; import { jsonSchema } from "ai"; import { getModel } from "../../../lib/generic-ai"; +import { Logger } from "winston"; export async function analyzeSchemaAndPrompt( urls: string[], schema: any, prompt: string, + logger: Logger, ): Promise<{ isMultiEntity: boolean; multiEntityKeys: string[]; @@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt( }> { let cost = 0; if (!schema) { - const genRes = await generateSchemaFromPrompt(prompt); + const genRes = await generateSchemaFromPrompt(prompt, logger); schema = genRes.extract; cost = genRes.cost; } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1a6a4262..87f4f76a 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -178,7 +178,7 @@ export async function performExtraction( let reqSchema = request.schema; if (!reqSchema && request.prompt) { - const schemaGenRes = await generateSchemaFromPrompt(request.prompt); + const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); reqSchema = schemaGenRes.extract; costTracking.otherCallCount++; costTracking.otherCost += schemaGenRes.cost; @@ -214,7 +214,7 @@ export async function performExtraction( keyIndicators, tokenUsage: schemaAnalysisTokenUsage, cost: schemaAnalysisCost, - } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? ""); + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); logger.debug("Analyzed schema.", { isMultiEntity, From a06910115b8e4f0486b6015fcc7e316edecbe141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:03:07 -0700 Subject: [PATCH 129/160] asd --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 955832ff..cc63a09f 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -265,6 +265,7 @@ export async function extractData({ shouldUseSmartscrape: extract?.shouldUseSmartscrape, url: urls, prompt: extract?.smartscrape_prompt, + providedExtractId: extractId, }) if (useAgent && extract?.shouldUseSmartscrape) { From edd4c3090800ba3754a16fe5bb7922dbd4d866e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:11:27 -0700 Subject: [PATCH 130/160] FIX IT --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index cc63a09f..d2ad2a06 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -272,7 +272,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, undefined, extractId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -284,6 +284,7 @@ export async function extractData({ return await smartScrape( urls[page.page_index], page.smartscrape_prompt, + undefined, extractId, ); }), From 0d813b628bd4e2a2c81ba87a0b1d96fb8657b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:25:42 -0700 Subject: [PATCH 131/160] feat: correlate smart scrape --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 5 ++++- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 3 +++ apps/api/src/scraper/scrapeURL/transformers/agent.ts | 2 +- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index d2ad2a06..cee1d5a6 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -185,11 +185,13 @@ export async function extractData({ urls, useAgent, extractId, + scrapeId, }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; extractId?: string; + scrapeId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -272,7 +274,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, undefined, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, undefined, extractId, scrapeId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -286,6 +288,7 @@ export async function extractData({ page.smartscrape_prompt, undefined, extractId, + scrapeId, ); }), ); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 4e4cbb20..046a7b5e 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -50,6 +50,7 @@ export async function smartScrape( prompt: string, sessionId?: string, extractId?: string, + scrapeId?: string, ): Promise { let logger = _logger.child({ method: "smartScrape", @@ -58,6 +59,7 @@ export async function smartScrape( url, prompt, sessionId, + scrapeId, }); try { logger.info("Initiating smart scrape request"); @@ -71,6 +73,7 @@ export async function smartScrape( prompt, userProvidedId: sessionId ?? undefined, extractId, + scrapeId, models: { thinkingModel: { model: "gemini-2.5-pro-preview-03-25", diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 6ab32862..30a0f46f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -25,7 +25,7 @@ export async function performAgent( let smartscrapeResults: SmartScrapeResult; try { - smartscrapeResults = await smartScrape(url, prompt, sessionId) + smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { logger.error("Cost limit exceeded", { error }) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index ebdb7a6b..804ead2e 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -601,6 +601,7 @@ export async function performLLMExtract( extractOptions: generationOptions, urls: [meta.url], useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model), + scrapeId: meta.id, }); if (warning) { From f92217e3b66a730beb88b565b92e72252556a6cd Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 16 Apr 2025 00:38:35 -0700 Subject: [PATCH 132/160] wip --- apps/api/src/controllers/v1/extract-status.ts | 1 + .../lib/extract/completions/batchExtract.ts | 23 ++++++++++++++++--- apps/api/src/lib/extract/extract-redis.ts | 6 ++++- .../api/src/lib/extract/extraction-service.ts | 18 ++++++++++++++- .../scrapeURL/lib/extractSmartScrape.ts | 5 +++- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 47e02f7b..76a611f8 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -73,5 +73,6 @@ export async function extractStatusController( llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined, sources: extract.showSources ? extract.sources : undefined, costTracking: extract.showCostTracking ? extract.costTracking : undefined, + sessionIds: extract.sessionIds ? extract.sessionIds : undefined, }); } diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index 1bffeaa2..7e254a0f 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -23,6 +23,7 @@ type BatchExtractOptions = { doc: Document; useAgent: boolean; extractId?: string; + sessionId?: string; }; /** @@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: otherCost: number; smartScrapeCallCount: number; otherCallCount: number; + sessionId?: string; }> { - const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options; - + const { + multiEntitySchema, + links, + prompt, + systemPrompt, + doc, + useAgent, + extractId, + sessionId } = options; const generationOptions: GenerateCompletionsOptions = { logger: logger.child({ @@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: let warning: string | undefined; let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0; try { - const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ + const { + extractedDataArray: e, + warning: w, + smartScrapeCost, + otherCost, + smartScrapeCallCount, + otherCallCount + } = await extractData({ extractOptions: generationOptions, urls: [doc.metadata.sourceURL || doc.metadata.url || ""], useAgent, extractId, + sessionId }); extractedDataArray = e; warning = w; diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index e560f18d..d256c582 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -8,6 +8,7 @@ export enum ExtractStep { MAP_RERANK = "map-rerank", MULTI_ENTITY = "multi-entity", MULTI_ENTITY_SCRAPE = "multi-entity-scrape", + MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape", MULTI_ENTITY_EXTRACT = "multi-entity-extract", SCRAPE = "scrape", EXTRACT = "extract", @@ -17,7 +18,7 @@ export enum ExtractStep { export type ExtractedStep = { step: ExtractStep; startedAt: number; - finishedAt: number; + finishedAt: number | null; error?: any; discoveredLinks?: string[]; }; @@ -38,6 +39,7 @@ export type StoredExtract = { sources?: { [key: string]: string[]; }; + sessionIds?: string[]; }; // Reduce TTL to 6 hours instead of 24 @@ -107,6 +109,8 @@ export async function updateExtract( })) }; + console.log(minimalExtract.sessionIds) + await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract)); await redisConnection.expire("extract:" + id, EXTRACT_TTL); } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1a6a4262..dd828f4f 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -413,8 +413,23 @@ export async function performExtraction( chunks.push(multyEntityDocs.slice(i, i + chunkSize)); } + const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID()); + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE, + startedAt: Date.now(), + finishedAt: null + }, + ], + sessionIds + }); + // Process chunks sequentially with timeout - for (const chunk of chunks) { + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const sessionId = sessionIds[i]; const chunkPromises = chunk.map(async (doc) => { try { ajv.compile(multiEntitySchema); @@ -432,6 +447,7 @@ export async function performExtraction( doc, useAgent: isAgentExtractModelValid(request.agent?.model), extractId, + sessionId }, logger); // Race between timeout and completion diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 7380f380..f642383a 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { getModel } from "../../../lib/generic-ai"; import { TokenUsage } from "../../../controllers/v1/types"; import type { SmartScrapeResult } from "./smartScrape"; +import { ExtractStep } from "src/lib/extract/extract-redis"; const commonSmartScrapeProperties = { shouldUseSmartscrape: { @@ -185,11 +186,13 @@ export async function extractData({ urls, useAgent, extractId, + sessionId }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; extractId?: string; + sessionId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -275,7 +278,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; From 129b10e4789b33fcda73542b0a5cf3023f798d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 01:04:48 -0700 Subject: [PATCH 133/160] fix(llmExtract): cost calculation --- .../scrapeURL/transformers/llmExtract.ts | 39 ++++++------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 804ead2e..fe65d591 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -259,23 +259,6 @@ export async function generateCompletions({ throw new Error("document.markdown is undefined -- this is unexpected"); } - const { maxInputTokens, maxOutputTokens } = getModelLimits( - currentModel.modelId, - ); - // Calculate 80% of max input tokens (for content) - const maxTokensSafe = Math.floor(maxInputTokens * 0.8); - - // Use the new trimming function - const { - text: trimmedMarkdown, - numTokens, - warning: trimWarning, - } = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning); - - // WE USE BIG MODELS NOW - // markdown = trimmedMarkdown; - // warning = trimWarning; - try { const prompt = options.prompt !== undefined @@ -300,16 +283,16 @@ export async function generateCompletions({ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -341,16 +324,16 @@ export async function generateCompletions({ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -541,13 +524,13 @@ export async function generateCompletions({ } // Since generateObject doesn't provide token usage, we'll estimate it - const promptTokens = numTokens; - const completionTokens = result?.usage?.completionTokens ?? 0; + const promptTokens = result.usage?.promptTokens ?? 0; + const completionTokens = result.usage?.completionTokens ?? 0; return { extract, warning, - numTokens, + numTokens: promptTokens, totalUsage: { promptTokens, completionTokens, From dcef6fbc13fcc673f3fea91385996b95beebacf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 01:13:49 -0700 Subject: [PATCH 134/160] feat(extractSmartScrape): mog it to 100 pages max --- .../src/scraper/scrapeURL/lib/extractSmartScrape.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index cee1d5a6..a20c5d2c 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -279,10 +279,18 @@ export async function extractData({ smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; } else { - const pages = extract?.smartscrapePages; + const pages = extract?.smartscrapePages ?? []; //do it async promiseall instead + if (pages.length > 100) { + logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", { + pagesLength: pages.length, + extractId, + scrapeId, + }); + } + smartscrapeResults = await Promise.all( - pages.map(async (page) => { + pages.slice(0, 100).map(async (page) => { return await smartScrape( urls[page.page_index], page.smartscrape_prompt, From aa2024657105ad8b6d0b47561d28bd54e01dfa5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 02:06:39 -0700 Subject: [PATCH 135/160] minor cost tracking fix --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index b6f19d17..b7251494 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -307,7 +307,7 @@ export async function extractData({ (acc, result) => acc + result.tokenUsage, 0, ); - smartScrapeCallCount += pages.length; + smartScrapeCallCount += smartscrapeResults.length; } // console.log("smartscrapeResults", smartscrapeResults); From d82f44c93e89cdc54a97d45f15f634b7de355407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 03:13:02 -0700 Subject: [PATCH 136/160] feat(extract): log failed extracts --- .../api/src/lib/extract/extraction-service.ts | 1622 +++++++++-------- 1 file changed, 832 insertions(+), 790 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 0b72f379..6f14306a 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs"; import { normalizeUrl } from "../canonical-url"; import { search } from "../../search"; import { buildRephraseToSerpPrompt } from "./build-prompts"; -import fs from "fs/promises"; + interface ExtractServiceOptions { request: ExtractRequest; teamId: string; @@ -109,845 +109,887 @@ export async function performExtraction( teamId, }); - // If no URLs are provided, generate URLs from the prompt - if ((!request.urls || request.urls.length === 0) && request.prompt) { - logger.debug("Generating URLs from prompt...", { - prompt: request.prompt, - }); - const rephrasedPrompt = await generateBasicCompletion( - buildRephraseToSerpPrompt(request.prompt), - ); - let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || ""; - if (rephrasedPrompt) { - costTracking.otherCallCount++; - costTracking.otherCost += rephrasedPrompt.cost; - costTracking.totalCost += rephrasedPrompt.cost; - } - const searchResults = await search({ - query: rptxt, - num_results: 10, - }); + try { - request.urls = searchResults.map((result) => result.url) as string[]; - } - if (request.urls && request.urls.length === 0) { - logger.error("No search results found", { - query: request.prompt, - }); - return { - success: false, - error: "No search results found", - extractId, - }; - } - - const urls = request.urls || ([] as string[]); - - if ( - request.__experimental_cacheMode == "load" && - request.__experimental_cacheKey && - urls - ) { - logger.debug("Loading cached docs..."); - try { - const cache = await getCachedDocs(urls, request.__experimental_cacheKey); - for (const doc of cache) { - if (doc.metadata.url) { - docsMap.set(normalizeUrl(doc.metadata.url), doc); - } - } - } catch (error) { - logger.error("Error loading cached docs", { error }); - } - } - - // Token tracking - let tokenUsage: TokenUsage[] = []; - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.INITIAL, - startedAt: Date.now(), - finishedAt: Date.now(), - discoveredLinks: request.urls, - }, - ], - }); - - let reqSchema = request.schema; - if (!reqSchema && request.prompt) { - const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); - reqSchema = schemaGenRes.extract; - costTracking.otherCallCount++; - costTracking.otherCost += schemaGenRes.cost; - costTracking.totalCost += schemaGenRes.cost; - - logger.debug("Generated request schema.", { - originalSchema: request.schema, - schema: reqSchema, - }); - } - - if (reqSchema) { - reqSchema = await dereferenceSchema(reqSchema); - } - - logger.debug("Transformed schema.", { - originalSchema: request.schema, - schema: reqSchema, - }); - - let rSchema = reqSchema; - - // agent evaluates if the schema or the prompt has an array with big amount of items - // also it checks if the schema any other properties that are not arrays - // if so, it splits the results into 2 types of completions: - // 1. the first one is a completion that will extract the array of items - // 2. the second one is multiple completions that will extract the items from the array - let startAnalyze = Date.now(); - const { - isMultiEntity, - multiEntityKeys, - reasoning, - keyIndicators, - tokenUsage: schemaAnalysisTokenUsage, - cost: schemaAnalysisCost, - } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); - - logger.debug("Analyzed schema.", { - isMultiEntity, - multiEntityKeys, - reasoning, - keyIndicators, - }); - - costTracking.otherCallCount++; - costTracking.otherCost += schemaAnalysisCost; - costTracking.totalCost += schemaAnalysisCost; - - // Track schema analysis tokens - tokenUsage.push(schemaAnalysisTokenUsage); - - let startMap = Date.now(); - let aggMapLinks: string[] = []; - logger.debug("Processing URLs...", { - urlCount: request.urls?.length || 0, - }); - - const urlPromises = urls.map((url) => - processUrl( - { - url, + // If no URLs are provided, generate URLs from the prompt + if ((!request.urls || request.urls.length === 0) && request.prompt) { + logger.debug("Generating URLs from prompt...", { prompt: request.prompt, - teamId, - allowExternalLinks: request.allowExternalLinks, - origin: request.origin, - limit: request.limit, - includeSubdomains: request.includeSubdomains, - schema: request.schema, - log, - isMultiEntity, - reasoning, - multiEntityKeys, - keyIndicators, - }, - urlTraces, - (links: string[]) => { - aggMapLinks.push(...links); - updateExtract(extractId, { - steps: [ - { - step: ExtractStep.MAP, - startedAt: startMap, - finishedAt: Date.now(), - discoveredLinks: aggMapLinks, - }, - ], - }); - }, - logger.child({ module: "extract", method: "processUrl", url }), - costTracking, - ), - ); - - const processedUrls = await Promise.all(urlPromises); - let links = processedUrls.flat().filter((url) => url); - logger.debug("Processed URLs.", { - linkCount: links.length, - }); - - if (links.length === 0) { - links = urls.map(x => x.replace(/\*$/g, "")); - logger.warn("0 links! Doing just the original URLs. (without * wildcard)", { - linkCount: links.length, - }); - } - - log["links"] = links; - log["linksLength"] = links.length; - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MAP_RERANK, - startedAt: startMap, - finishedAt: Date.now(), - discoveredLinks: links, - }, - ], - }); - - if (isMultiEntity && reqSchema) { - log["isMultiEntity"] = true; - logger.debug("=== MULTI-ENTITY ==="); - - const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( - reqSchema, - multiEntityKeys, - ); - rSchema = singleAnswerSchema; - logger.debug("Spread schemas.", { singleAnswerSchema, multiEntitySchema }); - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MULTI_ENTITY, - startedAt: startAnalyze, - finishedAt: Date.now(), - discoveredLinks: [], - }, - ], - }); - - const timeout = 60000; - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MULTI_ENTITY_SCRAPE, - startedAt: startAnalyze, - finishedAt: Date.now(), - discoveredLinks: links, - }, - ], - }); - - logger.debug("Starting multi-entity scrape..."); - let startScrape = Date.now(); - log["docsSizeBeforeMultiEntityScrape"] = docsMap.size; - - const scrapePromises = links.map((url) => { - if (!docsMap.has(normalizeUrl(url))) { - return scrapeDocument( - { - url, - teamId, - origin: request.origin || "api", - timeout, - }, - urlTraces, - logger.child({ - module: "extract", - method: "scrapeDocument", - url, - isMultiEntity: true, - }), - { - ...request.scrapeOptions, - - // Needs to be true for multi-entity to work properly - onlyMainContent: true, - }, - ); - } - return docsMap.get(normalizeUrl(url)); - }); - - let multyEntityDocs = (await Promise.all(scrapePromises)).filter( - (doc): doc is Document => doc !== null, - ); - - log["docsSizeAfterMultiEntityScrape"] = scrapePromises.length; - - logger.debug("Multi-entity scrape finished.", { - docCount: multyEntityDocs.length, - }); - - totalUrlsScraped += multyEntityDocs.length; - - let endScrape = Date.now(); - - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MULTI_ENTITY_SCRAPE, - startedAt: startScrape, - finishedAt: endScrape, - discoveredLinks: links, - }, - ], - }); - - for (const doc of multyEntityDocs) { - if (doc?.metadata?.url) { - docsMap.set(normalizeUrl(doc.metadata.url), doc); - } - } - - logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing - - // Process docs in chunks with queue style processing - const chunkSize = 50; - const timeoutCompletion = 45000; // 45 second timeout - const chunks: Document[][] = []; - const extractionResults: { extract: any; url: string }[] = []; - - // Split into chunks - for (let i = 0; i < multyEntityDocs.length; i += chunkSize) { - chunks.push(multyEntityDocs.slice(i, i + chunkSize)); - } - - const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID()); - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE, - startedAt: Date.now(), - finishedAt: null - }, - ], - sessionIds - }); - - // Process chunks sequentially with timeout - for (let i = 0; i < chunks.length; i++) { - const chunk = chunks[i]; - const sessionId = sessionIds[i]; - const chunkPromises = chunk.map(async (doc) => { - try { - ajv.compile(multiEntitySchema); - - // Wrap in timeout promise - const timeoutPromise = new Promise((resolve) => { - setTimeout(() => resolve(null), timeoutCompletion); - }); - - const completionPromise = batchExtractPromise({ - multiEntitySchema, - links, - prompt: request.prompt ?? "", - systemPrompt: request.systemPrompt ?? "", - doc, - useAgent: isAgentExtractModelValid(request.agent?.model), - extractId, - sessionId - }, logger); - - // Race between timeout and completion - const multiEntityCompletion = (await completionPromise) as Awaited< - ReturnType - >; - - // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema - - // Track multi-entity extraction tokens - if (multiEntityCompletion) { - tokenUsage.push(multiEntityCompletion.totalUsage); - - costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount; - costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost; - costTracking.otherCallCount += multiEntityCompletion.otherCallCount; - costTracking.otherCost += multiEntityCompletion.otherCost; - costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost; - - if (multiEntityCompletion.extract) { - return { - extract: multiEntityCompletion.extract, - url: doc.metadata.url || doc.metadata.sourceURL || "", - }; - } - } - - // console.log(multiEntityCompletion.extract) - // if (!multiEntityCompletion.extract?.is_content_relevant) { - // console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`); - // return null; - // } - - // Update token usage in traces - // if (multiEntityCompletion && multiEntityCompletion.numTokens) { - // const totalLength = docs.reduce( - // (sum, doc) => sum + (doc.markdown?.length || 0), - // 0, - // ); - // docs.forEach((doc) => { - // if (doc.metadata?.sourceURL) { - // const trace = urlTraces.find( - // (t) => t.url === doc.metadata.sourceURL, - // ); - // if (trace && trace.contentStats) { - // trace.contentStats.tokensUsed = Math.floor( - // ((doc.markdown?.length || 0) / totalLength) * - // (multiEntityCompletion?.numTokens || 0), - // ); - // } - // } - // }); - // } - - // if (multiEntityCompletion.extract && multiEntityCompletion.extract.extraction_confidence < 3) { - // console.log(`Skipping extraction for ${doc.metadata.url} as confidence is too low (${multiEntityCompletion.extract.extraction_confidence})`); - // return null; - // } - - return null; - } catch (error) { - logger.error(`Failed to process document.`, { - error, - url: doc.metadata.url ?? doc.metadata.sourceURL!, - }); - return null; - } }); - // Wait for current chunk to complete before processing next chunk - const chunkResults = await Promise.all(chunkPromises); - const validResults = chunkResults.filter( - (result): result is { extract: any; url: string } => result !== null, + const rephrasedPrompt = await generateBasicCompletion( + buildRephraseToSerpPrompt(request.prompt), ); - extractionResults.push(...validResults); - // Merge all extracts from valid results into a single array - const extractArrays = validResults.map((r) => - Array.isArray(r.extract) ? r.extract : [r.extract], - ); - const mergedExtracts = extractArrays.flat(); - multiEntityCompletions.push(...mergedExtracts); - multiEntityCompletions = multiEntityCompletions.filter((c) => c !== null); - logger.debug("All multi-entity completion chunks finished.", { - completionCount: multiEntityCompletions.length, + let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || ""; + if (rephrasedPrompt) { + costTracking.otherCallCount++; + costTracking.otherCost += rephrasedPrompt.cost; + costTracking.totalCost += rephrasedPrompt.cost; + } + const searchResults = await search({ + query: rptxt, + num_results: 10, }); - log["multiEntityCompletionsLength"] = multiEntityCompletions.length; + + request.urls = searchResults.map((result) => result.url) as string[]; } - - try { - // Use SourceTracker to handle source tracking - const sourceTracker = new SourceTracker(); - logger.debug("Created SourceTracker instance"); - - // Transform and merge results while preserving sources - try { - sourceTracker.transformResults( - extractionResults, - multiEntitySchema, - false, - ); - logger.debug("Successfully transformed results with sourceTracker"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in sourceTracker.transformResults: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in sourceTracker.transformResults:`, { error }); - throw error; - } - - try { - multiEntityResult = transformArrayToObject( - multiEntitySchema, - multiEntityCompletions, - ); - logger.debug("Successfully transformed array to object"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in transformArrayToObject: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in transformArrayToObject:`, { error }); - throw error; - } - - // Track sources before deduplication - try { - sourceTracker.trackPreDeduplicationSources(multiEntityResult); - logger.debug("Successfully tracked pre-deduplication sources"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in trackPreDeduplicationSources: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in trackPreDeduplicationSources:`, { error }); - throw error; - } - - // Apply deduplication and merge - try { - multiEntityResult = deduplicateObjectsArray(multiEntityResult); - logger.debug("Successfully deduplicated objects array"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in deduplicateObjectsArray: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in deduplicateObjectsArray:`, { error }); - throw error; - } - - try { - multiEntityResult = mergeNullValObjs(multiEntityResult); - logger.debug("Successfully merged null value objects"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in mergeNullValObjs: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in mergeNullValObjs:`, { error }); - throw error; - } - - // Map sources to final deduplicated/merged items - try { - const multiEntitySources = sourceTracker.mapSourcesToFinalItems( - multiEntityResult, - multiEntityKeys, - ); - Object.assign(sources, multiEntitySources); - logger.debug("Successfully mapped sources to final items"); - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Error in mapSourcesToFinalItems: ${JSON.stringify(error, null, 2)}\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Error in mapSourcesToFinalItems:`, { error }); - throw error; - } - } catch (error) { - const errorLog = `[${new Date().toISOString()}] Failed to transform array to object\nError: ${JSON.stringify(error, null, 2)}\nStack: ${error.stack}\nMultiEntityResult: ${JSON.stringify(multiEntityResult, null, 2)}\nMultiEntityCompletions: ${JSON.stringify(multiEntityCompletions, null, 2)}\nMultiEntitySchema: ${JSON.stringify(multiEntitySchema, null, 2)}\n\n`; - await fs.appendFile('logs/extraction-errors.log', errorLog); - logger.error(`Failed to transform array to object`, { - error, - errorMessage: error.message, - errorStack: error.stack, - multiEntityResult: JSON.stringify(multiEntityResult), - multiEntityCompletions: JSON.stringify(multiEntityCompletions), - multiEntitySchema: JSON.stringify(multiEntitySchema) + if (request.urls && request.urls.length === 0) { + logger.error("No search results found", { + query: request.prompt, + }); + logJob({ + job_id: extractId, + success: false, + message: "No search results found", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + cost_tracking: costTracking, }); return { success: false, - error: - "An unexpected error occurred. Please contact help@firecrawl.com for help.", + error: "No search results found", extractId, - urlTrace: urlTraces, - totalUrlsScraped, }; } - } - if ( - rSchema && - Object.keys(rSchema).length > 0 && - rSchema.properties && - Object.keys(rSchema.properties).length > 0 - ) { - log["isSingleEntity"] = true; - logger.debug("=== SINGLE PAGES ===", { - linkCount: links.length, - schema: rSchema, - }); - // Scrape documents - const timeout = 60000; - let singleAnswerDocs: Document[] = []; + const urls = request.urls || ([] as string[]); - // let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces); + if ( + request.__experimental_cacheMode == "load" && + request.__experimental_cacheKey && + urls + ) { + logger.debug("Loading cached docs..."); + try { + const cache = await getCachedDocs(urls, request.__experimental_cacheKey); + for (const doc of cache) { + if (doc.metadata.url) { + docsMap.set(normalizeUrl(doc.metadata.url), doc); + } + } + } catch (error) { + logger.error("Error loading cached docs", { error }); + } + } + + // Token tracking + let tokenUsage: TokenUsage[] = []; await updateExtract(extractId, { status: "processing", steps: [ { - step: ExtractStep.SCRAPE, + step: ExtractStep.INITIAL, startedAt: Date.now(), finishedAt: Date.now(), + discoveredLinks: request.urls, + }, + ], + }); + + let reqSchema = request.schema; + if (!reqSchema && request.prompt) { + const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); + reqSchema = schemaGenRes.extract; + costTracking.otherCallCount++; + costTracking.otherCost += schemaGenRes.cost; + costTracking.totalCost += schemaGenRes.cost; + + logger.debug("Generated request schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + } + + if (reqSchema) { + reqSchema = await dereferenceSchema(reqSchema); + } + + logger.debug("Transformed schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); + + let rSchema = reqSchema; + + // agent evaluates if the schema or the prompt has an array with big amount of items + // also it checks if the schema any other properties that are not arrays + // if so, it splits the results into 2 types of completions: + // 1. the first one is a completion that will extract the array of items + // 2. the second one is multiple completions that will extract the items from the array + let startAnalyze = Date.now(); + const { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + tokenUsage: schemaAnalysisTokenUsage, + cost: schemaAnalysisCost, + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); + + logger.debug("Analyzed schema.", { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + }); + + costTracking.otherCallCount++; + costTracking.otherCost += schemaAnalysisCost; + costTracking.totalCost += schemaAnalysisCost; + + // Track schema analysis tokens + tokenUsage.push(schemaAnalysisTokenUsage); + + let startMap = Date.now(); + let aggMapLinks: string[] = []; + logger.debug("Processing URLs...", { + urlCount: request.urls?.length || 0, + }); + + const urlPromises = urls.map((url) => + processUrl( + { + url, + prompt: request.prompt, + teamId, + allowExternalLinks: request.allowExternalLinks, + origin: request.origin, + limit: request.limit, + includeSubdomains: request.includeSubdomains, + schema: request.schema, + log, + isMultiEntity, + reasoning, + multiEntityKeys, + keyIndicators, + }, + urlTraces, + (links: string[]) => { + aggMapLinks.push(...links); + updateExtract(extractId, { + steps: [ + { + step: ExtractStep.MAP, + startedAt: startMap, + finishedAt: Date.now(), + discoveredLinks: aggMapLinks, + }, + ], + }); + }, + logger.child({ module: "extract", method: "processUrl", url }), + costTracking, + ), + ); + + const processedUrls = await Promise.all(urlPromises); + let links = processedUrls.flat().filter((url) => url); + logger.debug("Processed URLs.", { + linkCount: links.length, + }); + + if (links.length === 0) { + links = urls.map(x => x.replace(/\*$/g, "")); + logger.warn("0 links! Doing just the original URLs. (without * wildcard)", { + linkCount: links.length, + }); + } + + log["links"] = links; + log["linksLength"] = links.length; + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MAP_RERANK, + startedAt: startMap, + finishedAt: Date.now(), discoveredLinks: links, }, ], }); - log["docsSizeBeforeSingleEntityScrape"] = docsMap.size; - const scrapePromises = links.map((url) => { - if (!docsMap.has(normalizeUrl(url))) { - return scrapeDocument( + + if (isMultiEntity && reqSchema) { + log["isMultiEntity"] = true; + logger.debug("=== MULTI-ENTITY ==="); + + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + reqSchema, + multiEntityKeys, + ); + rSchema = singleAnswerSchema; + logger.debug("Spread schemas.", { singleAnswerSchema, multiEntitySchema }); + + await updateExtract(extractId, { + status: "processing", + steps: [ { - url, - teamId, - origin: request.origin || "api", - timeout, + step: ExtractStep.MULTI_ENTITY, + startedAt: startAnalyze, + finishedAt: Date.now(), + discoveredLinks: [], }, - urlTraces, - logger.child({ - module: "extract", - method: "scrapeDocument", - url, - isMultiEntity: false, - }), - request.scrapeOptions, - ); - } - return docsMap.get(normalizeUrl(url)); - }); + ], + }); - try { - const results = await Promise.all(scrapePromises); - log["docsSizeAfterSingleEntityScrape"] = docsMap.size; + const timeout = 60000; - for (const doc of results) { + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_SCRAPE, + startedAt: startAnalyze, + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + + logger.debug("Starting multi-entity scrape..."); + let startScrape = Date.now(); + log["docsSizeBeforeMultiEntityScrape"] = docsMap.size; + + const scrapePromises = links.map((url) => { + if (!docsMap.has(normalizeUrl(url))) { + return scrapeDocument( + { + url, + teamId, + origin: request.origin || "api", + timeout, + }, + urlTraces, + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: true, + }), + { + ...request.scrapeOptions, + + // Needs to be true for multi-entity to work properly + onlyMainContent: true, + }, + ); + } + return docsMap.get(normalizeUrl(url)); + }); + + let multyEntityDocs = (await Promise.all(scrapePromises)).filter( + (doc): doc is Document => doc !== null, + ); + + log["docsSizeAfterMultiEntityScrape"] = scrapePromises.length; + + logger.debug("Multi-entity scrape finished.", { + docCount: multyEntityDocs.length, + }); + + totalUrlsScraped += multyEntityDocs.length; + + let endScrape = Date.now(); + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_SCRAPE, + startedAt: startScrape, + finishedAt: endScrape, + discoveredLinks: links, + }, + ], + }); + + for (const doc of multyEntityDocs) { if (doc?.metadata?.url) { docsMap.set(normalizeUrl(doc.metadata.url), doc); } } + logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing - const validResults = results.filter( - (doc): doc is Document => doc !== null, - ); - singleAnswerDocs.push(...validResults); - totalUrlsScraped += validResults.length; + // Process docs in chunks with queue style processing + const chunkSize = 50; + const timeoutCompletion = 45000; // 45 second timeout + const chunks: Document[][] = []; + const extractionResults: { extract: any; url: string }[] = []; - logger.debug("Scrapes finished.", { docCount: validResults.length }); - } catch (error) { - return { - success: false, - error: error.message, - extractId, - urlTrace: urlTraces, - totalUrlsScraped, - }; - } + // Split into chunks + for (let i = 0; i < multyEntityDocs.length; i += chunkSize) { + chunks.push(multyEntityDocs.slice(i, i + chunkSize)); + } - if (docsMap.size == 0) { - // All urls are invalid - logger.error("All provided URLs are invalid!"); - return { - success: false, - error: - "All provided URLs are invalid. Please check your input and try again.", - extractId, - urlTrace: request.urlTrace ? urlTraces : undefined, - totalUrlsScraped: 0, - }; - } + const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID()); + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE, + startedAt: Date.now(), + finishedAt: null + }, + ], + sessionIds + }); - await updateExtract(extractId, { - status: "processing", - steps: [ - { - step: ExtractStep.EXTRACT, - startedAt: Date.now(), - finishedAt: Date.now(), - discoveredLinks: links, - }, - ], - }); + // Process chunks sequentially with timeout + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const sessionId = sessionIds[i]; + const chunkPromises = chunk.map(async (doc) => { + try { + ajv.compile(multiEntitySchema); - // Generate completions - logger.debug("Generating singleAnswer completions..."); - log["singleAnswerDocsLength"] = singleAnswerDocs.length; - let { - extract: completionResult, - tokenUsage: singleAnswerTokenUsage, - sources: singleAnswerSources, - smartScrapeCost: singleAnswerSmartScrapeCost, - otherCost: singleAnswerOtherCost, - smartScrapeCallCount: singleAnswerSmartScrapeCallCount, - otherCallCount: singleAnswerOtherCallCount, - } = await singleAnswerCompletion({ - singleAnswerDocs, - rSchema, - links, - prompt: request.prompt ?? "", - systemPrompt: request.systemPrompt ?? "", - useAgent: isAgentExtractModelValid(request.agent?.model), - extractId, - }); - costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; - costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; - costTracking.otherCost += singleAnswerOtherCost; - costTracking.otherCallCount += singleAnswerOtherCallCount; - costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost; - logger.debug("Done generating singleAnswer completions."); + // Wrap in timeout promise + const timeoutPromise = new Promise((resolve) => { + setTimeout(() => resolve(null), timeoutCompletion); + }); - singleAnswerResult = transformArrayToObject(rSchema, completionResult); + const completionPromise = batchExtractPromise({ + multiEntitySchema, + links, + prompt: request.prompt ?? "", + systemPrompt: request.systemPrompt ?? "", + doc, + useAgent: isAgentExtractModelValid(request.agent?.model), + extractId, + sessionId + }, logger); - singleAnswerResult = deduplicateObjectsArray(singleAnswerResult); - // Track single answer extraction tokens and sources - if (completionResult) { - tokenUsage.push(singleAnswerTokenUsage); + // Race between timeout and completion + const multiEntityCompletion = (await completionPromise) as Awaited< + ReturnType + >; - // Add sources for top-level properties in single answer - if (rSchema?.properties) { - Object.keys(rSchema.properties).forEach((key) => { - if (completionResult[key] !== undefined) { - sources[key] = - singleAnswerSources || - singleAnswerDocs.map( - (doc) => doc.metadata.url || doc.metadata.sourceURL || "", - ); + // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema + + // Track multi-entity extraction tokens + if (multiEntityCompletion) { + tokenUsage.push(multiEntityCompletion.totalUsage); + + costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount; + costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost; + costTracking.otherCallCount += multiEntityCompletion.otherCallCount; + costTracking.otherCost += multiEntityCompletion.otherCost; + costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost; + + if (multiEntityCompletion.extract) { + return { + extract: multiEntityCompletion.extract, + url: doc.metadata.url || doc.metadata.sourceURL || "", + }; + } + } + + // console.log(multiEntityCompletion.extract) + // if (!multiEntityCompletion.extract?.is_content_relevant) { + // console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`); + // return null; + // } + + // Update token usage in traces + // if (multiEntityCompletion && multiEntityCompletion.numTokens) { + // const totalLength = docs.reduce( + // (sum, doc) => sum + (doc.markdown?.length || 0), + // 0, + // ); + // docs.forEach((doc) => { + // if (doc.metadata?.sourceURL) { + // const trace = urlTraces.find( + // (t) => t.url === doc.metadata.sourceURL, + // ); + // if (trace && trace.contentStats) { + // trace.contentStats.tokensUsed = Math.floor( + // ((doc.markdown?.length || 0) / totalLength) * + // (multiEntityCompletion?.numTokens || 0), + // ); + // } + // } + // }); + // } + + // if (multiEntityCompletion.extract && multiEntityCompletion.extract.extraction_confidence < 3) { + // console.log(`Skipping extraction for ${doc.metadata.url} as confidence is too low (${multiEntityCompletion.extract.extraction_confidence})`); + // return null; + // } + + return null; + } catch (error) { + logger.error(`Failed to process document.`, { + error, + url: doc.metadata.url ?? doc.metadata.sourceURL!, + }); + return null; } }); + // Wait for current chunk to complete before processing next chunk + const chunkResults = await Promise.all(chunkPromises); + const validResults = chunkResults.filter( + (result): result is { extract: any; url: string } => result !== null, + ); + extractionResults.push(...validResults); + // Merge all extracts from valid results into a single array + const extractArrays = validResults.map((r) => + Array.isArray(r.extract) ? r.extract : [r.extract], + ); + const mergedExtracts = extractArrays.flat(); + multiEntityCompletions.push(...mergedExtracts); + multiEntityCompletions = multiEntityCompletions.filter((c) => c !== null); + logger.debug("All multi-entity completion chunks finished.", { + completionCount: multiEntityCompletions.length, + }); + log["multiEntityCompletionsLength"] = multiEntityCompletions.length; + } + + try { + // Use SourceTracker to handle source tracking + const sourceTracker = new SourceTracker(); + logger.debug("Created SourceTracker instance"); + + // Transform and merge results while preserving sources + try { + sourceTracker.transformResults( + extractionResults, + multiEntitySchema, + false, + ); + logger.debug("Successfully transformed results with sourceTracker"); + } catch (error) { + logger.error(`Error in sourceTracker.transformResults:`, { error }); + throw error; + } + + try { + multiEntityResult = transformArrayToObject( + multiEntitySchema, + multiEntityCompletions, + ); + logger.debug("Successfully transformed array to object"); + } catch (error) { + logger.error(`Error in transformArrayToObject:`, { error }); + throw error; + } + + // Track sources before deduplication + try { + sourceTracker.trackPreDeduplicationSources(multiEntityResult); + logger.debug("Successfully tracked pre-deduplication sources"); + } catch (error) { + logger.error(`Error in trackPreDeduplicationSources:`, { error }); + throw error; + } + + // Apply deduplication and merge + try { + multiEntityResult = deduplicateObjectsArray(multiEntityResult); + logger.debug("Successfully deduplicated objects array"); + } catch (error) { + logger.error(`Error in deduplicateObjectsArray:`, { error }); + throw error; + } + + try { + multiEntityResult = mergeNullValObjs(multiEntityResult); + logger.debug("Successfully merged null value objects"); + } catch (error) { + logger.error(`Error in mergeNullValObjs:`, { error }); + throw error; + } + + // Map sources to final deduplicated/merged items + try { + const multiEntitySources = sourceTracker.mapSourcesToFinalItems( + multiEntityResult, + multiEntityKeys, + ); + Object.assign(sources, multiEntitySources); + logger.debug("Successfully mapped sources to final items"); + } catch (error) { + logger.error(`Error in mapSourcesToFinalItems:`, { error }); + throw error; + } + } catch (error) { + logger.error(`Failed to transform array to object`, { + error, + errorMessage: error.message, + errorStack: error.stack, + multiEntityResult: JSON.stringify(multiEntityResult), + multiEntityCompletions: JSON.stringify(multiEntityCompletions), + multiEntitySchema: JSON.stringify(multiEntitySchema) + }); + logJob({ + job_id: extractId, + success: false, + message: (error instanceof Error ? error.message : "Failed to transform array to object"), + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + cost_tracking: costTracking, + }); + return { + success: false, + error: + "An unexpected error occurred. Please contact help@firecrawl.com for help.", + extractId, + urlTrace: urlTraces, + totalUrlsScraped, + }; + } + } + if ( + rSchema && + Object.keys(rSchema).length > 0 && + rSchema.properties && + Object.keys(rSchema.properties).length > 0 + ) { + log["isSingleEntity"] = true; + logger.debug("=== SINGLE PAGES ===", { + linkCount: links.length, + schema: rSchema, + }); + + // Scrape documents + const timeout = 60000; + let singleAnswerDocs: Document[] = []; + + // let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces); + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.SCRAPE, + startedAt: Date.now(), + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + log["docsSizeBeforeSingleEntityScrape"] = docsMap.size; + const scrapePromises = links.map((url) => { + if (!docsMap.has(normalizeUrl(url))) { + return scrapeDocument( + { + url, + teamId, + origin: request.origin || "api", + timeout, + }, + urlTraces, + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: false, + }), + request.scrapeOptions, + ); + } + return docsMap.get(normalizeUrl(url)); + }); + + try { + const results = await Promise.all(scrapePromises); + log["docsSizeAfterSingleEntityScrape"] = docsMap.size; + + for (const doc of results) { + if (doc?.metadata?.url) { + docsMap.set(normalizeUrl(doc.metadata.url), doc); + } + } + logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing + + const validResults = results.filter( + (doc): doc is Document => doc !== null, + ); + singleAnswerDocs.push(...validResults); + totalUrlsScraped += validResults.length; + + logger.debug("Scrapes finished.", { docCount: validResults.length }); + } catch (error) { + return { + success: false, + error: error.message, + extractId, + urlTrace: urlTraces, + totalUrlsScraped, + }; + } + + if (docsMap.size == 0) { + // All urls are invalid + logger.error("All provided URLs are invalid!"); + return { + success: false, + error: + "All provided URLs are invalid. Please check your input and try again.", + extractId, + urlTrace: request.urlTrace ? urlTraces : undefined, + totalUrlsScraped: 0, + }; + } + + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.EXTRACT, + startedAt: Date.now(), + finishedAt: Date.now(), + discoveredLinks: links, + }, + ], + }); + + // Generate completions + logger.debug("Generating singleAnswer completions..."); + log["singleAnswerDocsLength"] = singleAnswerDocs.length; + let { + extract: completionResult, + tokenUsage: singleAnswerTokenUsage, + sources: singleAnswerSources, + smartScrapeCost: singleAnswerSmartScrapeCost, + otherCost: singleAnswerOtherCost, + smartScrapeCallCount: singleAnswerSmartScrapeCallCount, + otherCallCount: singleAnswerOtherCallCount, + } = await singleAnswerCompletion({ + singleAnswerDocs, + rSchema, + links, + prompt: request.prompt ?? "", + systemPrompt: request.systemPrompt ?? "", + useAgent: isAgentExtractModelValid(request.agent?.model), + extractId, + }); + costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; + costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; + costTracking.otherCost += singleAnswerOtherCost; + costTracking.otherCallCount += singleAnswerOtherCallCount; + costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost; + logger.debug("Done generating singleAnswer completions."); + + singleAnswerResult = transformArrayToObject(rSchema, completionResult); + + singleAnswerResult = deduplicateObjectsArray(singleAnswerResult); + // Track single answer extraction tokens and sources + if (completionResult) { + tokenUsage.push(singleAnswerTokenUsage); + + // Add sources for top-level properties in single answer + if (rSchema?.properties) { + Object.keys(rSchema.properties).forEach((key) => { + if (completionResult[key] !== undefined) { + sources[key] = + singleAnswerSources || + singleAnswerDocs.map( + (doc) => doc.metadata.url || doc.metadata.sourceURL || "", + ); + } + }); + } + } + + // singleAnswerResult = completionResult; + // singleAnswerCompletions = singleAnswerResult; + + // Update token usage in traces + // if (completions && completions.numTokens) { + // const totalLength = docs.reduce( + // (sum, doc) => sum + (doc.markdown?.length || 0), + // 0, + // ); + // docs.forEach((doc) => { + // if (doc.metadata?.sourceURL) { + // const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL); + // if (trace && trace.contentStats) { + // trace.contentStats.tokensUsed = Math.floor( + // ((doc.markdown?.length || 0) / totalLength) * + // (completions?.numTokens || 0), + // ); + // } + // } + // }); + // } + } + + log["singleAnswerResult"] = singleAnswerResult; + log["multiEntityResult"] = multiEntityResult; + + let finalResult = reqSchema + ? await mixSchemaObjects( + reqSchema, + singleAnswerResult, + multiEntityResult, + logger.child({ method: "mixSchemaObjects" }), + ) + : singleAnswerResult || multiEntityResult; + + // Tokenize final result to get token count + // let finalResultTokens = 0; + // if (finalResult) { + // const finalResultStr = JSON.stringify(finalResult); + // finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o"); + + // } + // // Deduplicate and validate final result against schema + // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) { + // const schemaValidation = await generateCompletions( + // logger.child({ method: "extractService/validateAndDeduplicate" }), + // { + // mode: "llm", + // systemPrompt: `You are a data validator and deduplicator. Your task is to: + // 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema + // 2. Ensure all data matches the provided schema + // 3. Keep only the highest quality and most complete entries when duplicates are found. + + // Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`, + // prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n + + // + + // ${JSON.stringify(finalResult)} + + // + + // + + // ${JSON.stringify(reqSchema)} + + // + // `, + // schema: reqSchema, + // }, + // undefined, + // undefined, + // true, + // "gpt-4o" + // ); + // console.log("schemaValidation", schemaValidation); + + // console.log("schemaValidation", finalResult); + + // if (schemaValidation?.extract) { + // tokenUsage.push(schemaValidation.totalUsage); + // finalResult = schemaValidation.extract; + // } + // } + + const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0); + const llmUsage = estimateTotalCost(tokenUsage); + let tokensToBill = calculateFinalResultCost(finalResult); + + if (CUSTOM_U_TEAMS.includes(teamId)) { + tokensToBill = 1; + } + + // Bill team for usage + billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => { + logger.error( + `Failed to bill team ${teamId} for ${tokensToBill} tokens: ${error}`, + ); + }); + + // Log job with token usage and sources + logJob({ + job_id: extractId, + success: true, + message: "Extract completed", + num_docs: 1, + docs: finalResult ?? {}, + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: totalTokensUsed, + tokens_billed: tokensToBill, + sources, + cost_tracking: costTracking, + }).then(() => { + updateExtract(extractId, { + status: "completed", + llmUsage, + sources, + costTracking, + }).catch((error) => { + logger.error( + `Failed to update extract ${extractId} status to completed: ${error}`, + ); + }); + }); + + logger.debug("Done!"); + + if ( + request.__experimental_cacheMode == "save" && + request.__experimental_cacheKey + ) { + logger.debug("Saving cached docs..."); + try { + await saveCachedDocs( + [...docsMap.values()], + request.__experimental_cacheKey, + ); + } catch (error) { + logger.error("Error saving cached docs", { error }); } } - // singleAnswerResult = completionResult; - // singleAnswerCompletions = singleAnswerResult; + // fs.writeFile( + // `logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, + // JSON.stringify(log, null, 2), + // ); - // Update token usage in traces - // if (completions && completions.numTokens) { - // const totalLength = docs.reduce( - // (sum, doc) => sum + (doc.markdown?.length || 0), - // 0, - // ); - // docs.forEach((doc) => { - // if (doc.metadata?.sourceURL) { - // const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL); - // if (trace && trace.contentStats) { - // trace.contentStats.tokensUsed = Math.floor( - // ((doc.markdown?.length || 0) / totalLength) * - // (completions?.numTokens || 0), - // ); - // } - // } - // }); - // } - } - - log["singleAnswerResult"] = singleAnswerResult; - log["multiEntityResult"] = multiEntityResult; - - let finalResult = reqSchema - ? await mixSchemaObjects( - reqSchema, - singleAnswerResult, - multiEntityResult, - logger.child({ method: "mixSchemaObjects" }), - ) - : singleAnswerResult || multiEntityResult; - - // Tokenize final result to get token count - // let finalResultTokens = 0; - // if (finalResult) { - // const finalResultStr = JSON.stringify(finalResult); - // finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o"); - - // } - // // Deduplicate and validate final result against schema - // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) { - // const schemaValidation = await generateCompletions( - // logger.child({ method: "extractService/validateAndDeduplicate" }), - // { - // mode: "llm", - // systemPrompt: `You are a data validator and deduplicator. Your task is to: - // 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema - // 2. Ensure all data matches the provided schema - // 3. Keep only the highest quality and most complete entries when duplicates are found. - - // Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`, - // prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n - - // - - // ${JSON.stringify(finalResult)} - - // - - // - - // ${JSON.stringify(reqSchema)} - - // - // `, - // schema: reqSchema, - // }, - // undefined, - // undefined, - // true, - // "gpt-4o" - // ); - // console.log("schemaValidation", schemaValidation); - - // console.log("schemaValidation", finalResult); - - // if (schemaValidation?.extract) { - // tokenUsage.push(schemaValidation.totalUsage); - // finalResult = schemaValidation.extract; - // } - // } - - const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0); - const llmUsage = estimateTotalCost(tokenUsage); - let tokensToBill = calculateFinalResultCost(finalResult); - - if (CUSTOM_U_TEAMS.includes(teamId)) { - tokensToBill = 1; - } - - // Bill team for usage - billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => { - logger.error( - `Failed to bill team ${teamId} for ${tokensToBill} tokens: ${error}`, - ); - }); - - // Log job with token usage and sources - logJob({ - job_id: extractId, - success: true, - message: "Extract completed", - num_docs: 1, - docs: finalResult ?? {}, - time_taken: (new Date().getTime() - Date.now()) / 1000, - team_id: teamId, - mode: "extract", - url: request.urls?.join(", ") || "", - scrapeOptions: request, - origin: request.origin ?? "api", - num_tokens: totalTokensUsed, - tokens_billed: tokensToBill, - sources, - cost_tracking: costTracking, - }).then(() => { - updateExtract(extractId, { - status: "completed", + return { + success: true, + data: finalResult ?? {}, + extractId, + warning: undefined, + urlTrace: request.urlTrace ? urlTraces : undefined, llmUsage, + totalUrlsScraped, sources, - costTracking, - }).catch((error) => { - logger.error( - `Failed to update extract ${extractId} status to completed: ${error}`, - ); + }; + } catch (error) { + await logJob({ + job_id: extractId, + success: false, + message: (error instanceof Error ? error.message : typeof error === "string" ? error : "An unexpected error occurred"), + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + cost_tracking: costTracking, }); - }); - - logger.debug("Done!"); - - if ( - request.__experimental_cacheMode == "save" && - request.__experimental_cacheKey - ) { - logger.debug("Saving cached docs..."); - try { - await saveCachedDocs( - [...docsMap.values()], - request.__experimental_cacheKey, - ); - } catch (error) { - logger.error("Error saving cached docs", { error }); - } + throw error; } - - // fs.writeFile( - // `logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, - // JSON.stringify(log, null, 2), - // ); - - return { - success: true, - data: finalResult ?? {}, - extractId, - warning: undefined, - urlTrace: request.urlTrace ? urlTraces : undefined, - llmUsage, - totalUrlsScraped, - sources, - }; } From defc80af323dd05ac7e16d3414cdeed672b542e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 03:23:18 -0700 Subject: [PATCH 137/160] stream session IDs for single URLs --- apps/api/src/lib/extract/completions/singleAnswer.ts | 3 +++ apps/api/src/lib/extract/extraction-service.ts | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 4b36a58f..3f15edfb 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -16,6 +16,7 @@ export async function singleAnswerCompletion({ systemPrompt, useAgent, extractId, + sessionId, }: { singleAnswerDocs: Document[]; rSchema: any; @@ -24,6 +25,7 @@ export async function singleAnswerCompletion({ systemPrompt: string; useAgent: boolean; extractId: string; + sessionId: string; }): Promise<{ extract: any; tokenUsage: TokenUsage; @@ -58,6 +60,7 @@ export async function singleAnswerCompletion({ urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""), useAgent, extractId, + sessionId, }); const completion = { diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 6f14306a..01035151 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -750,6 +750,8 @@ export async function performExtraction( }; } + let thisSessionId = 'fc-' + crypto.randomUUID(); + await updateExtract(extractId, { status: "processing", steps: [ @@ -760,6 +762,7 @@ export async function performExtraction( discoveredLinks: links, }, ], + sessionIds: [thisSessionId], }); // Generate completions @@ -781,6 +784,7 @@ export async function performExtraction( systemPrompt: request.systemPrompt ?? "", useAgent: isAgentExtractModelValid(request.agent?.model), extractId, + sessionId: thisSessionId, }); costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; From 8766bc6f6b566a44b4f7a2c5396e5b85689ec924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 03:28:33 -0700 Subject: [PATCH 138/160] temp: don't... do that --- apps/api/src/lib/extract/extraction-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 01035151..415d1362 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -762,7 +762,7 @@ export async function performExtraction( discoveredLinks: links, }, ], - sessionIds: [thisSessionId], + // sessionIds: [thisSessionId], }); // Generate completions From 39d10dc734c383f0e85c8872db65d5e5dbb05eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 09:55:32 -0700 Subject: [PATCH 139/160] feat: disable cost tracking --- apps/api/src/lib/extract/extraction-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 415d1362..ae498061 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -936,7 +936,7 @@ export async function performExtraction( status: "completed", llmUsage, sources, - costTracking, + // costTracking, }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, From 7787a58bebba2c9db485f4864f5d31fa8ed6ef91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 10:48:03 -0700 Subject: [PATCH 140/160] default timeout --- apps/api/src/lib/extract/extraction-service.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index ae498061..c8f8271f 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -377,6 +377,7 @@ export async function performExtraction( isMultiEntity: true, }), { + timeout: 300000, ...request.scrapeOptions, // Needs to be true for multi-entity to work properly @@ -703,7 +704,10 @@ export async function performExtraction( url, isMultiEntity: false, }), - request.scrapeOptions, + { + timeout: 300000, + ...request.scrapeOptions, + }, ); } return docsMap.get(normalizeUrl(url)); From c71325126130218163f8f6cc4412112f2b2f3305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 12:28:35 -0700 Subject: [PATCH 141/160] feat(extraction-service): send thisSessionId for single entity --- apps/api/src/lib/extract/extraction-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index c8f8271f..50358328 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -766,7 +766,7 @@ export async function performExtraction( discoveredLinks: links, }, ], - // sessionIds: [thisSessionId], + sessionIds: [thisSessionId], }); // Generate completions From 4740254b89e72c54eaef68c1d867bd6065c57078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 12:32:02 -0700 Subject: [PATCH 142/160] feat(rquests.http): add extract --- apps/api/requests.http | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 9183ad24..7627979d 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -34,6 +34,37 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} DELETE {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} +### Extract website +# @name extract +POST {{baseUrl}}/v1/extract HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "urls": ["https://firecrawl.dev"], + "schema": { + "type": "object", + "properties": { + "companyName": { + "type": "string" + }, + "companyDescription": { + "type": "string" + } + } + }, + "agent": { + "model": "fire-1" + }, + "origin": "api-sdk" +} + +### Check Extract Status +@extractId = {{extract.response.body.$.id}} +# @name extractStatus +GET {{baseUrl}}/v1/extract/{{extractId}} HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} + ### Batch Scrape Websites # @name batchScrape POST {{baseUrl}}/v1/batch/scrape HTTP/1.1 @@ -53,7 +84,6 @@ content-type: application/json GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} - ### Map Website # @name map POST {{baseUrl}}/v1/map HTTP/1.1 @@ -65,10 +95,6 @@ content-type: application/json "sitemapOnly": true } -### -DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - ### Generate LLMs TXT # @name generateLlmsTxt POST {{baseUrl}}/v1/llmstxt HTTP/1.1 From 772a3ea751e56aee5a64b91b7ff5b463ea593e3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 12:36:43 -0700 Subject: [PATCH 143/160] Bump the SDK --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 12b65dcf..905f0949 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.23.9", + "version": "1.24.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 5ee2434c9d5aeb3bd29fd1dea93b0577404f5e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 14:24:09 -0700 Subject: [PATCH 144/160] more logs --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index b7251494..8e291a17 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -215,7 +215,6 @@ export async function extractData({ // TODO: remove the "required" fields here!! it breaks o3-mini if (!schema && extractOptions.options.prompt) { - logger.info("Generating schema from prompt"); const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger); otherCallCount++; otherCost += genRes.cost; @@ -229,6 +228,9 @@ export async function extractData({ }; // console.log("schema", schema); // console.log("schemaToUse", schemaToUse); + logger.info("Generated schema from prompt", { + schemaToUse, + }); let extract: any, warning: string | undefined, From ad7e3f62d1c324153a70d72882f16fe21cd9fbcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 14:32:00 -0700 Subject: [PATCH 145/160] feat(extractSmartScrape): resolve refs in provided schema --- .../scrapeURL/lib/extractSmartScrape.ts | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 8e291a17..0e7ac878 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -181,6 +181,33 @@ export function prepareSmartScrapeSchema( return { schemaToUse: wrappedSchema }; } +// Resolve all $defs references in the schema +const resolveRefs = (obj: any, defs: any): any => { + if (!obj || typeof obj !== 'object') return obj; + + if (obj.$ref && typeof obj.$ref === 'string') { + // Handle $ref references + const refPath = obj.$ref.split('/'); + if (refPath[0] === '#' && refPath[1] === '$defs') { + const defName = refPath[refPath.length - 1]; + return resolveRefs({ ...defs[defName] }, defs); + } + } + + // Handle arrays + if (Array.isArray(obj)) { + return obj.map(item => resolveRefs(item, defs)); + } + + // Handle objects + const resolved: any = {}; + for (const [key, value] of Object.entries(obj)) { + if (key === '$defs') continue; + resolved[key] = resolveRefs(value, defs); + } + return resolved; +}; + export async function extractData({ extractOptions, urls, @@ -221,6 +248,15 @@ export async function extractData({ schema = genRes.extract; } + if (schema) { + const defs = schema.$defs || {}; + schema = resolveRefs(schema, defs); + delete schema.$defs; + logger.info("Resolved schema refs", { + schema, + }); + } + const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl); const extractOptionsNewSchema = { ...extractOptions, From b45e3bda133d940b421ebbea3bcb3b087fd1adfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 15:08:05 -0700 Subject: [PATCH 146/160] better logs --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index fe65d591..1f0fec18 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -401,7 +401,9 @@ export async function generateCompletions({ try { JSON.parse(text); return text; - } catch (_) {} + } catch (e) { + logger.error("Even after repairing, failed to parse JSON", { error: e }); + } } try { @@ -545,7 +547,7 @@ export async function generateCompletions({ throw new LLMRefusalError(error.message); } logger.error("LLM extraction failed", { - error: lastError.message, + error: lastError, model: currentModel.modelId, mode, }); From 4c5120e0818d499cb157821bd345a9328d67b872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 15:10:17 -0700 Subject: [PATCH 147/160] feat(llm-extract): do more logging, even more --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 1f0fec18..041fd036 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -451,6 +451,12 @@ export async function generateCompletions({ // JSON.stringify(generateObjectConfig, null, 2), // ); + logger.debug("Generating object...", { generateObjectConfig: { + ...generateObjectConfig, + prompt: generateObjectConfig.prompt.slice(0, 100) + "...", + system: generateObjectConfig.system?.slice(0, 100) + "...", + }, model, retryModel }); + let result: { object: any; usage: TokenUsage } | undefined; try { result = await generateObject(generateObjectConfig); From 509e6e658c9b72bfbb8384530e6296346a5bb1b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 15:40:07 -0700 Subject: [PATCH 148/160] feat(llmExtract): more logging --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 041fd036..21572c9d 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -386,6 +386,8 @@ export async function generateCompletions({ const repairConfig = { experimental_repairText: async ({ text, error }) => { // AI may output a markdown JSON code block. Remove it - mogery + logger.debug("Repairing text", { textType: typeof text, error }); + if (typeof text === "string" && text.trim().startsWith("```")) { if (text.trim().startsWith("```json")) { text = text.trim().slice("```json".length).trim(); @@ -400,6 +402,7 @@ export async function generateCompletions({ // If this fixes the JSON, just return it. If not, continue - mogery try { JSON.parse(text); + logger.debug("Repaired text with string manipulation"); return text; } catch (e) { logger.error("Even after repairing, failed to parse JSON", { error: e }); @@ -418,6 +421,7 @@ export async function generateCompletions({ }, }, }); + logger.debug("Repaired text with LLM"); return fixedText; } catch (repairError) { lastError = repairError as Error; From 751c30f139bea9739d6bef32177495888ec0850e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 16:23:12 -0700 Subject: [PATCH 149/160] feat(extractSmartScrape): better pagination handling --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 2 +- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 0e7ac878..37205343 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -16,7 +16,7 @@ const commonSmartScrapeProperties = { shouldUseSmartscrape: { type: "boolean", description: - "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, login, inputs etc.). SmartScrape can perform these actions to access the data.", + "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, login, inputs, pagination etc.). SmartScrape can perform these actions to access the data.", }, // Note: extractedData is added dynamically in prepareSmartScrapeSchema }; diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 21572c9d..73456096 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -386,7 +386,7 @@ export async function generateCompletions({ const repairConfig = { experimental_repairText: async ({ text, error }) => { // AI may output a markdown JSON code block. Remove it - mogery - logger.debug("Repairing text", { textType: typeof text, error }); + logger.debug("Repairing text", { textType: typeof text, textPeek: JSON.stringify(text).slice(0, 100) + "...", error }); if (typeof text === "string" && text.trim().startsWith("```")) { if (text.trim().startsWith("```json")) { From 6a93293fd020eb077607c9c4280ff7a849f4c1ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 16:39:48 -0700 Subject: [PATCH 150/160] feat(smart-scrape): use correct models for multi-entity assembly --- .../scrapeURL/lib/extractSmartScrape.ts | 20 +++++++++++++------ .../src/scraper/scrapeURL/lib/smartScrape.ts | 12 +++++++++-- .../scraper/scrapeURL/transformers/agent.ts | 7 ++++++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 37205343..82f16d12 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -315,7 +315,13 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId), + await smartScrape({ + url: urls[0], + prompt: extract?.smartscrape_prompt, + sessionId, + extractId, + scrapeId, + }), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -332,13 +338,13 @@ export async function extractData({ smartscrapeResults = await Promise.all( pages.slice(0, 100).map(async (page) => { - return await smartScrape( - urls[page.page_index], - page.smartscrape_prompt, - undefined, + return await smartScrape({ + url: urls[page.page_index], + prompt: page.smartscrape_prompt, + sessionId, extractId, scrapeId, - ); + }); }), ); smartScrapeCost += smartscrapeResults.reduce( @@ -364,6 +370,8 @@ export async function extractData({ const newExtractOptions = { ...extractOptions, markdown: markdown, + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), }; const { extract, warning, totalUsage, model, cost } = await generateCompletions(newExtractOptions); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 046a7b5e..a913ec27 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -45,13 +45,21 @@ export type SmartScrapeResult = z.infer; * @returns A promise that resolves to an object matching the SmartScrapeResult type. * @throws Throws an error if the request fails or the response is invalid. */ -export async function smartScrape( +export async function smartScrape({ + url, + prompt, + sessionId, + extractId, + scrapeId, + beforeSubmission, +}: { url: string, prompt: string, sessionId?: string, extractId?: string, scrapeId?: string, -): Promise { + beforeSubmission?: () => unknown, +}): Promise { let logger = _logger.child({ method: "smartScrape", module: "smartScrape", diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 30a0f46f..5ad304d3 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -25,7 +25,12 @@ export async function performAgent( let smartscrapeResults: SmartScrapeResult; try { - smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id) + smartscrapeResults = await smartScrape({ + url, + prompt, + sessionId, + scrapeId: meta.id, + }) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { logger.error("Cost limit exceeded", { error }) From ba4df67de7830c2c56ed4fef7cb7d11a6c217b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 16:53:04 -0700 Subject: [PATCH 151/160] force 2.5 --- apps/api/src/lib/extract/completions/batchExtract.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index 7e254a0f..c57bcd7a 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -73,7 +73,8 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: }, markdown: buildDocument(doc), isExtractEndpoint: true, - model: getModel("gemini-2.0-flash", "google"), + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), }; let extractedDataArray: any[] = []; From 8546bcacc0bd8f2c39b1e49bead34ec77043acfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 09:23:53 -0700 Subject: [PATCH 152/160] new cost tracking --- apps/api/src/controllers/v1/batch-scrape.ts | 1 + apps/api/src/controllers/v1/scrape.ts | 7 +- apps/api/src/controllers/v1/search.ts | 11 +- apps/api/src/controllers/v1/types.ts | 1 - .../deep-research/deep-research-service.ts | 9 +- .../src/lib/deep-research/research-manager.ts | 25 +++ .../completions/analyzeSchemaAndPrompt.ts | 20 +-- .../lib/extract/completions/batchExtract.ts | 20 +-- .../extract/completions/checkShouldExtract.ts | 12 +- .../lib/extract/completions/singleAnswer.ts | 30 ++-- .../api/src/lib/extract/extraction-service.ts | 88 +++++------ .../api/src/lib/extract/fire-0/reranker-f0.ts | 13 +- .../lib/extract/fire-0/url-processor-f0.ts | 5 +- apps/api/src/lib/extract/reranker.ts | 9 ++ apps/api/src/lib/extract/url-processor.ts | 46 ++++-- .../generate-llmstxt-service.ts | 11 +- apps/api/src/main/runWebScraper.ts | 7 +- apps/api/src/scraper/WebScraper/sitemap.ts | 2 + apps/api/src/scraper/scrapeURL/index.ts | 9 +- .../scrapeURL/lib/extractSmartScrape.ts | 51 +++---- .../src/scraper/scrapeURL/lib/smartScrape.ts | 14 +- .../src/scraper/scrapeURL/scrapeURL.test.ts | 20 ++- .../scraper/scrapeURL/transformers/agent.ts | 15 +- .../scraper/scrapeURL/transformers/diff.ts | 39 ++--- .../scrapeURL/transformers/llmExtract.ts | 142 +++++++++++++----- apps/api/src/services/queue-worker.ts | 11 +- apps/api/src/types.ts | 1 + 27 files changed, 395 insertions(+), 224 deletions(-) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index b4da87c5..d5896f6d 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -22,6 +22,7 @@ import { getJobPriority } from "../../lib/job-priority"; import { addScrapeJobs } from "../../services/queue-jobs"; import { callWebhook } from "../../services/webhook"; import { logger as _logger } from "../../lib/logger"; +import { CostTracking } from "../../lib/extract/extraction-service"; export async function batchScrapeController( req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index de33a7b9..86ca905b 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -15,6 +15,7 @@ import { getJobPriority } from "../../lib/job-priority"; import { getScrapeQueue } from "../../services/queue-service"; import { getJob } from "./crawl-status"; import { getJobFromGCS } from "../../lib/gcs-jobs"; +import { CostTracking } from "src/lib/extract/extraction-service"; export async function scrapeController( req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, @@ -128,12 +129,6 @@ export async function scrapeController( } } - const cost_tracking = doc?.metadata?.costTracking; - - if (doc && doc.metadata) { - delete doc.metadata.costTracking; - } - return res.status(200).json({ success: true, data: doc, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 06f26700..1c902df4 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -21,6 +21,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; import { logger as _logger } from "../../lib/logger"; import type { Logger } from "winston"; import { getJobFromGCS } from "../../lib/gcs-jobs"; +import { CostTracking } from "../../lib/extract/extraction-service"; // Used for deep research export async function searchAndScrapeSearchResult( @@ -32,6 +33,7 @@ export async function searchAndScrapeSearchResult( scrapeOptions: ScrapeOptions; }, logger: Logger, + costTracking: CostTracking, ): Promise { try { const searchResults = await search({ @@ -48,7 +50,8 @@ export async function searchAndScrapeSearchResult( description: result.description }, options, - logger + logger, + costTracking ) ) ); @@ -68,6 +71,7 @@ async function scrapeSearchResult( scrapeOptions: ScrapeOptions; }, logger: Logger, + costTracking: CostTracking, ): Promise { const jobId = uuidv4(); const jobPriority = await getJobPriority({ @@ -220,6 +224,8 @@ export async function searchController( }); } + const costTracking = new CostTracking(); + // Scrape each non-blocked result, handling timeouts individually logger.info("Scraping search results"); const scrapePromises = searchResults.map((result) => @@ -228,7 +234,7 @@ export async function searchController( origin: req.body.origin, timeout: req.body.timeout, scrapeOptions: req.body.scrapeOptions, - }, logger), + }, logger, costTracking), ); const docs = await Promise.all(scrapePromises); @@ -279,6 +285,7 @@ export async function searchController( mode: "search", url: req.body.query, origin: req.body.origin, + cost_tracking: costTracking, }); return res.status(200).json({ diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 92c95e20..38881dd2 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -739,7 +739,6 @@ export type Document = { statusCode: number; scrapeId?: string; error?: string; - costTracking?: CostTracking; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; serpResults?: { diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 64d94946..296661c9 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -5,6 +5,7 @@ import { ResearchLLMService, ResearchStateManager } from "./research-manager"; import { logJob } from "../../services/logging/log_job"; import { billTeam } from "../../services/billing/credit_billing"; import { ExtractOptions } from "../../controllers/v1/types"; +import { CostTracking } from "../extract/extraction-service"; interface DeepResearchServiceOptions { researchId: string; @@ -21,6 +22,7 @@ interface DeepResearchServiceOptions { } export async function performDeepResearch(options: DeepResearchServiceOptions) { + const costTracking = new CostTracking(); const { researchId, teamId, timeLimit, subId, maxUrls } = options; const startTime = Date.now(); let currentTopic = options.query; @@ -70,6 +72,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { await llmService.generateSearchQueries( nextSearchTopic, state.getFindings(), + costTracking, ) ).slice(0, 3); @@ -109,7 +112,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { fastMode: false, blockAds: false, }, - }, logger); + }, logger, costTracking); return response.length > 0 ? response : []; }); @@ -205,6 +208,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { currentTopic, timeRemaining, options.systemPrompt ?? "", + costTracking, ); if (!analysis) { @@ -268,6 +272,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { state.getFindings(), state.getSummaries(), options.analysisPrompt, + costTracking, options.formats, options.jsonOptions, ); @@ -278,6 +283,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { state.getFindings(), state.getSummaries(), options.analysisPrompt, + costTracking, ); } @@ -307,6 +313,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { origin: "api", num_tokens: 0, tokens_billed: 0, + cost_tracking: costTracking, }); await updateDeepResearch(researchId, { status: "completed", diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index 87655fac..0f9618a9 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -12,6 +12,7 @@ import { import { ExtractOptions } from "../../controllers/v1/types"; import { getModel } from "../generic-ai"; +import { CostTracking } from "../extract/extraction-service"; interface AnalysisResult { gaps: string[]; nextSteps: string[]; @@ -152,6 +153,7 @@ export class ResearchLLMService { async generateSearchQueries( topic: string, findings: DeepResearchFinding[] = [], + costTracking: CostTracking, ): Promise<{ query: string; researchGoal: string }[]> { const { extract } = await generateCompletions({ logger: this.logger.child({ @@ -194,6 +196,13 @@ export class ResearchLLMService { The first SERP query you generate should be a very concise, simple version of the topic. `, }, markdown: "", + costTrackingOptions: { + costTracking, + metadata: { + module: "deep-research", + method: "generateSearchQueries", + }, + }, }); return extract.queries; @@ -204,6 +213,7 @@ export class ResearchLLMService { currentTopic: string, timeRemaining: number, systemPrompt: string, + costTracking: CostTracking, ): Promise { try { const timeRemainingMinutes = @@ -246,6 +256,13 @@ export class ResearchLLMService { ).text, }, markdown: "", + costTrackingOptions: { + costTracking, + metadata: { + module: "deep-research", + method: "analyzeAndPlan", + }, + }, }); return extract.analysis; @@ -260,6 +277,7 @@ export class ResearchLLMService { findings: DeepResearchFinding[], summaries: string[], analysisPrompt: string, + costTracking: CostTracking, formats?: string[], jsonOptions?: ExtractOptions, ): Promise { @@ -312,6 +330,13 @@ export class ResearchLLMService { }, markdown: "", model: getModel("o3-mini"), + costTrackingOptions: { + costTracking, + metadata: { + module: "deep-research", + method: "generateFinalAnalysis", + }, + }, }); return extract; diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index 98e3ccc0..2626c50b 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -11,25 +11,23 @@ import { import { jsonSchema } from "ai"; import { getModel } from "../../../lib/generic-ai"; import { Logger } from "winston"; - +import { CostTracking } from "../extraction-service"; export async function analyzeSchemaAndPrompt( urls: string[], schema: any, prompt: string, logger: Logger, + costTracking: CostTracking, ): Promise<{ isMultiEntity: boolean; multiEntityKeys: string[]; reasoning: string; keyIndicators: string[]; tokenUsage: TokenUsage; - cost: number; }> { - let cost = 0; if (!schema) { - const genRes = await generateSchemaFromPrompt(prompt, logger); + const genRes = await generateSchemaFromPrompt(prompt, logger, costTracking); schema = genRes.extract; - cost = genRes.cost; } const schemaString = JSON.stringify(schema); @@ -49,7 +47,7 @@ export async function analyzeSchemaAndPrompt( ); try { - const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({ + const { extract: result, totalUsage } = await generateCompletions({ logger, options: { mode: "llm", @@ -59,8 +57,14 @@ export async function analyzeSchemaAndPrompt( }, markdown: "", model, + costTrackingOptions: { + costTracking, + metadata: { + module: "extract", + method: "analyzeSchemaAndPrompt", + }, + }, }); - cost += cost2; const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } = checkSchema.parse(result); @@ -71,7 +75,6 @@ export async function analyzeSchemaAndPrompt( reasoning, keyIndicators, tokenUsage: totalUsage, - cost, }; } catch (e) { logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", { @@ -90,6 +93,5 @@ export async function analyzeSchemaAndPrompt( totalTokens: 0, model: model.modelId, }, - cost: 0, }; } diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index c57bcd7a..b7075c6e 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -10,7 +10,7 @@ import { buildBatchExtractSystemPrompt, } from "../build-prompts"; import { getModel } from "../../generic-ai"; - +import { CostTracking } from "../extraction-service"; import fs from "fs/promises"; import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; import type { Logger } from "winston"; @@ -24,6 +24,7 @@ type BatchExtractOptions = { useAgent: boolean; extractId?: string; sessionId?: string; + costTracking: CostTracking; }; /** @@ -75,6 +76,13 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: isExtractEndpoint: true, model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + costTrackingOptions: { + costTracking: options.costTracking, + metadata: { + module: "extract", + method: "batchExtractPromise", + }, + }, }; let extractedDataArray: any[] = []; @@ -84,23 +92,15 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: const { extractedDataArray: e, warning: w, - smartScrapeCost, - otherCost, - smartScrapeCallCount, - otherCallCount } = await extractData({ extractOptions: generationOptions, urls: [doc.metadata.sourceURL || doc.metadata.url || ""], useAgent, extractId, - sessionId + sessionId, }); extractedDataArray = e; warning = w; - smCost = smartScrapeCost; - oCost = otherCost; - smCallCount = smartScrapeCallCount; - oCallCount = otherCallCount; } catch (error) { logger.error("extractData failed", { error }); } diff --git a/apps/api/src/lib/extract/completions/checkShouldExtract.ts b/apps/api/src/lib/extract/completions/checkShouldExtract.ts index 3bff4fc7..e2c10ade 100644 --- a/apps/api/src/lib/extract/completions/checkShouldExtract.ts +++ b/apps/api/src/lib/extract/completions/checkShouldExtract.ts @@ -7,12 +7,14 @@ import { buildShouldExtractUserPrompt, } from "../build-prompts"; import { getModel } from "../../../lib/generic-ai"; +import { CostTracking } from "../extraction-service"; export async function checkShouldExtract( prompt: string, multiEntitySchema: any, doc: Document, -): Promise<{ tokenUsage: TokenUsage; extract: boolean; cost: number }> { + costTracking: CostTracking, +): Promise<{ tokenUsage: TokenUsage; extract: boolean; }> { const shouldExtractCheck = await generateCompletions({ logger: logger.child({ method: "extractService/checkShouldExtract" }), options: { @@ -32,11 +34,17 @@ export async function checkShouldExtract( markdown: buildDocument(doc), isExtractEndpoint: true, model: getModel("gpt-4o-mini"), + costTrackingOptions: { + costTracking, + metadata: { + module: "extract", + method: "checkShouldExtract", + }, + }, }); return { tokenUsage: shouldExtractCheck.totalUsage, extract: shouldExtractCheck.extract["extract"], - cost: shouldExtractCheck.cost, }; } diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 3f15edfb..866b6049 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -7,6 +7,7 @@ import { buildDocument } from "../build-document"; import { Document, TokenUsage } from "../../../controllers/v1/types"; import { getModel } from "../../../lib/generic-ai"; import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; +import { CostTracking } from "../extraction-service"; export async function singleAnswerCompletion({ singleAnswerDocs, @@ -17,6 +18,7 @@ export async function singleAnswerCompletion({ useAgent, extractId, sessionId, + costTracking, }: { singleAnswerDocs: Document[]; rSchema: any; @@ -26,14 +28,11 @@ export async function singleAnswerCompletion({ useAgent: boolean; extractId: string; sessionId: string; + costTracking: CostTracking; }): Promise<{ extract: any; tokenUsage: TokenUsage; sources: string[]; - smartScrapeCallCount: number; - smartScrapeCost: number; - otherCallCount: number; - otherCost: number; }> { const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; const generationOptions: GenerateCompletionsOptions = { @@ -49,13 +48,20 @@ export async function singleAnswerCompletion({ "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.", prompt: docsPrompt, schema: rSchema, + }, + markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`, + isExtractEndpoint: true, + model: getModel("gemini-2.0-flash", "google"), + costTrackingOptions: { + costTracking, + metadata: { + module: "extract", + method: "singleAnswerCompletion", }, - markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`, - isExtractEndpoint: true, - model: getModel("gemini-2.0-flash", "google"), - }; - - const { extractedDataArray, warning, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ + }, + }; + + const { extractedDataArray, warning } = await extractData({ extractOptions: generationOptions, urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""), useAgent, @@ -100,9 +106,5 @@ export async function singleAnswerCompletion({ sources: singleAnswerDocs.map( (doc) => doc.metadata.url || doc.metadata.sourceURL || "", ), - smartScrapeCost, - otherCost, - smartScrapeCallCount, - otherCallCount, }; } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 50358328..ae3df261 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -67,14 +67,39 @@ type completions = { sources?: string[]; }; -export type CostTracking = { - smartScrapeCallCount: number; - smartScrapeCost: number; - otherCallCount: number; - otherCost: number; - totalCost: number; - costLimitExceededTokenUsage?: number; -}; +export class CostTracking { + calls: { + type: "smartScrape" | "other", + metadata: Record, + cost: number, + tokens?: { + input: number, + output: number, + }, + stack: string, + }[] = []; + + constructor() {} + + public addCall(call: Omit) { + this.calls.push({ + ...call, + stack: new Error().stack!.split("\n").slice(2).join("\n"), + }); + } + + public toJSON() { + return { + calls: this.calls, + + smartScrapeCallCount: this.calls.filter(c => c.type === "smartScrape").length, + smartScrapeCost: this.calls.filter(c => c.type === "smartScrape").reduce((acc, c) => acc + c.cost, 0), + otherCallCount: this.calls.filter(c => c.type === "other").length, + otherCost: this.calls.filter(c => c.type === "other").reduce((acc, c) => acc + c.cost, 0), + totalCost: this.calls.reduce((acc, c) => acc + c.cost, 0), + } + } +} export async function performExtraction( extractId: string, @@ -89,13 +114,7 @@ export async function performExtraction( let singleAnswerResult: any = {}; let totalUrlsScraped = 0; let sources: Record = {}; - let costTracking: CostTracking = { - smartScrapeCallCount: 0, - smartScrapeCost: 0, - otherCallCount: 0, - otherCost: 0, - totalCost: 0, - }; + let costTracking: CostTracking = new CostTracking(); let log = { extractId, @@ -118,13 +137,9 @@ export async function performExtraction( }); const rephrasedPrompt = await generateBasicCompletion( buildRephraseToSerpPrompt(request.prompt), + costTracking, ); let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || ""; - if (rephrasedPrompt) { - costTracking.otherCallCount++; - costTracking.otherCost += rephrasedPrompt.cost; - costTracking.totalCost += rephrasedPrompt.cost; - } const searchResults = await search({ query: rptxt, num_results: 10, @@ -197,11 +212,9 @@ export async function performExtraction( let reqSchema = request.schema; if (!reqSchema && request.prompt) { - const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); + const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger, costTracking); reqSchema = schemaGenRes.extract; - costTracking.otherCallCount++; - costTracking.otherCost += schemaGenRes.cost; - costTracking.totalCost += schemaGenRes.cost; + logger.debug("Generated request schema.", { originalSchema: request.schema, @@ -232,8 +245,7 @@ export async function performExtraction( reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage, - cost: schemaAnalysisCost, - } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger, costTracking); logger.debug("Analyzed schema.", { isMultiEntity, @@ -242,11 +254,6 @@ export async function performExtraction( keyIndicators, }); - costTracking.otherCallCount++; - costTracking.otherCost += schemaAnalysisCost; - costTracking.totalCost += schemaAnalysisCost; - - // Track schema analysis tokens tokenUsage.push(schemaAnalysisTokenUsage); let startMap = Date.now(); @@ -467,7 +474,8 @@ export async function performExtraction( doc, useAgent: isAgentExtractModelValid(request.agent?.model), extractId, - sessionId + sessionId, + costTracking, }, logger); // Race between timeout and completion @@ -481,12 +489,6 @@ export async function performExtraction( if (multiEntityCompletion) { tokenUsage.push(multiEntityCompletion.totalUsage); - costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount; - costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost; - costTracking.otherCallCount += multiEntityCompletion.otherCallCount; - costTracking.otherCost += multiEntityCompletion.otherCost; - costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost; - if (multiEntityCompletion.extract) { return { extract: multiEntityCompletion.extract, @@ -776,10 +778,6 @@ export async function performExtraction( extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources, - smartScrapeCost: singleAnswerSmartScrapeCost, - otherCost: singleAnswerOtherCost, - smartScrapeCallCount: singleAnswerSmartScrapeCallCount, - otherCallCount: singleAnswerOtherCallCount, } = await singleAnswerCompletion({ singleAnswerDocs, rSchema, @@ -789,12 +787,8 @@ export async function performExtraction( useAgent: isAgentExtractModelValid(request.agent?.model), extractId, sessionId: thisSessionId, + costTracking, }); - costTracking.smartScrapeCost += singleAnswerSmartScrapeCost; - costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount; - costTracking.otherCost += singleAnswerOtherCost; - costTracking.otherCallCount += singleAnswerOtherCallCount; - costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost; logger.debug("Done generating singleAnswer completions."); singleAnswerResult = transformArrayToObject(rSchema, completionResult); diff --git a/apps/api/src/lib/extract/fire-0/reranker-f0.ts b/apps/api/src/lib/extract/fire-0/reranker-f0.ts index 87e673df..155df0c6 100644 --- a/apps/api/src/lib/extract/fire-0/reranker-f0.ts +++ b/apps/api/src/lib/extract/fire-0/reranker-f0.ts @@ -6,7 +6,7 @@ import { extractConfig } from "../config"; import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; import { performRanking_F0 } from "./ranker-f0"; import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0"; - +import { CostTracking } from "../extraction-service"; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, }); @@ -166,7 +166,7 @@ export type RerankerOptions = { urlTraces: URLTrace[]; }; -export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise { +export async function rerankLinksWithLLM_F0(options: RerankerOptions, costTracking: CostTracking): Promise { const { links, searchQuery, urlTraces } = options; const chunkSize = 100; const chunks: MapDocument[][] = []; @@ -231,7 +231,14 @@ export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise { +export async function generateBasicCompletion(prompt: string, costTracking: CostTracking): Promise<{ text: string } | null> { try { const result = await generateText({ model: getModel("gpt-4o", "openai"), @@ -22,7 +22,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s }, } }); - return { text: result.text, cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) }; + costTracking.addCall({ + type: "other", + metadata: { + module: "extract", + method: "generateBasicCompletion", + }, + cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0), + tokens: { + input: result.usage?.promptTokens ?? 0, + output: result.usage?.completionTokens ?? 0, + }, + }); + return { text: result.text }; } catch (error) { console.error("Error generating basic completion:", error); if (error?.type == "rate_limit_error") { @@ -36,7 +48,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s }, } }); - return { text: result.text, cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) }; + costTracking.addCall({ + type: "other", + metadata: { + module: "extract", + method: "generateBasicCompletion", + }, + cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0), + tokens: { + input: result.usage?.promptTokens ?? 0, + output: result.usage?.completionTokens ?? 0, + }, + }); + return { text: result.text }; } catch (fallbackError) { console.error("Error generating basic completion with fallback model:", fallbackError); return null; @@ -96,13 +120,11 @@ export async function processUrl( if (options.prompt) { const res = await generateBasicCompletion( buildRefrasedPrompt(options.prompt, baseUrl), + costTracking, ); if (res) { searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt; - costTracking.otherCallCount++; - costTracking.otherCost += res.cost; - costTracking.totalCost += res.cost; } } @@ -223,13 +245,11 @@ export async function processUrl( try { const res = await generateBasicCompletion( buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl), + costTracking, ); if (res) { rephrasedPrompt = res.text; - costTracking.otherCallCount++; - costTracking.otherCost += res.cost; - costTracking.totalCost += res.cost; } else { rephrasedPrompt = "Extract the data according to the schema: " + @@ -262,10 +282,8 @@ export async function processUrl( reasoning: options.reasoning, multiEntityKeys: options.multiEntityKeys, keyIndicators: options.keyIndicators, + costTracking, }); - costTracking.otherCallCount++; - costTracking.otherCost += rerankerResult.cost; - costTracking.totalCost += rerankerResult.cost; mappedLinks = rerankerResult.mapDocument; let tokensUsed = rerankerResult.tokensUsed; logger.info("Reranked! (pass 1)", { @@ -283,10 +301,8 @@ export async function processUrl( reasoning: options.reasoning, multiEntityKeys: options.multiEntityKeys, keyIndicators: options.keyIndicators, + costTracking, }); - costTracking.otherCallCount++; - costTracking.otherCost += rerankerResult.cost; - costTracking.totalCost += rerankerResult.cost; mappedLinks = rerankerResult.mapDocument; tokensUsed += rerankerResult.tokensUsed; logger.info("Reranked! (pass 2)", { diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts index fa72d582..3528f48f 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts @@ -11,7 +11,7 @@ import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { getModel } from "../generic-ai"; import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; - +import { CostTracking } from "../extract/extraction-service"; interface GenerateLLMsTextServiceOptions { generationId: string; teamId: string; @@ -71,6 +71,7 @@ export async function performGenerateLlmsTxt( generationId, teamId, }); + const costTracking = new CostTracking(); try { // Enforce max URL limit @@ -167,6 +168,13 @@ export async function performGenerateLlmsTxt( prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`, }, markdown: document.markdown, + costTrackingOptions: { + costTracking, + metadata: { + module: "generate-llmstxt", + method: "generateDescription", + }, + }, }); return { @@ -229,6 +237,7 @@ export async function performGenerateLlmsTxt( num_tokens: 0, tokens_billed: 0, sources: {}, + cost_tracking: costTracking, }); // Bill team for usage diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index ba983e07..8e287180 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -17,14 +17,17 @@ import { } from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; import { indexPage } from "../lib/extract/index/pinecone"; +import { CostTracking } from "../lib/extract/extraction-service"; configDotenv(); export async function startWebScraperPipeline({ job, token, + costTracking, }: { job: Job & { id: string }; token: string; + costTracking: CostTracking; }) { return await runWebScraper({ url: job.data.url, @@ -52,6 +55,7 @@ export async function startWebScraperPipeline({ is_scrape: job.data.is_scrape ?? false, is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false, + costTracking, }); } @@ -68,6 +72,7 @@ export async function runWebScraper({ is_scrape = false, is_crawl = false, urlInvisibleInCurrentCrawl = false, + costTracking, }: RunWebScraperParams): Promise { const logger = _logger.child({ method: "runWebScraper", @@ -101,7 +106,7 @@ export async function runWebScraper({ ...internalOptions, urlInvisibleInCurrentCrawl, teamId: internalOptions?.teamId ?? team_id, - }); + }, costTracking); if (!response.success) { if (response.error instanceof Error) { throw response.error; diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index f945cd22..67d3a2ff 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -3,6 +3,7 @@ import { WebCrawler } from "./crawler"; import { scrapeURL } from "../scrapeURL"; import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types"; import type { Logger } from "winston"; +import { CostTracking } from "../../lib/extract/extraction-service"; const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== "" && process.env.FIRE_ENGINE_BETA_URL !== undefined; @@ -49,6 +50,7 @@ export async function getLinksFromSitemap( abort, teamId: "sitemap", }, + new CostTracking(), ); if ( diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index cedd275e..b20fbdfd 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -26,6 +26,7 @@ import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; import { urlSpecificParams } from "./lib/urlSpecificParams"; import { loadMock, MockState } from "./lib/mock"; +import { CostTracking } from "../../lib/extract/extraction-service"; export type ScrapeUrlResponse = ( | { @@ -55,6 +56,7 @@ export type Meta = { url?: string; status: number; } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty + costTracking: CostTracking; }; function buildFeatureFlags( @@ -127,6 +129,7 @@ async function buildMetaObject( url: string, options: ScrapeOptions, internalOptions: InternalOptions, + costTracking: CostTracking, ): Promise { const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; @@ -158,12 +161,13 @@ async function buildMetaObject( ? await loadMock(options.useMock, _logger) : null, pdfPrefetch: undefined, + costTracking, }; } export type InternalOptions = { teamId: string; - + priority?: number; // Passed along to fire-engine forceEngine?: Engine | Engine[]; atsv?: boolean; // anti-bot solver, beta @@ -389,8 +393,9 @@ export async function scrapeURL( url: string, options: ScrapeOptions, internalOptions: InternalOptions, + costTracking: CostTracking, ): Promise { - const meta = await buildMetaObject(id, url, options, internalOptions); + const meta = await buildMetaObject(id, url, options, internalOptions, costTracking); try { while (true) { try { diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 82f16d12..e02f4c83 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -10,8 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { getModel } from "../../../lib/generic-ai"; import { TokenUsage } from "../../../controllers/v1/types"; import type { SmartScrapeResult } from "./smartScrape"; -import { ExtractStep } from "src/lib/extract/extract-redis"; - +import { CostTracking } from "../../../lib/extract/extraction-service"; const commonSmartScrapeProperties = { shouldUseSmartscrape: { type: "boolean", @@ -225,26 +224,16 @@ export async function extractData({ }): Promise<{ extractedDataArray: any[]; warning: any; - smartScrapeCallCount: number; - otherCallCount: number; - smartScrapeCost: number; - otherCost: number; costLimitExceededTokenUsage: number | null; }> { let schema = extractOptions.options.schema; const logger = extractOptions.logger; const isSingleUrl = urls.length === 1; - let smartScrapeCost = 0; - let otherCost = 0; - let smartScrapeCallCount = 0; - let otherCallCount = 0; let costLimitExceededTokenUsage: number | null = null; // TODO: remove the "required" fields here!! it breaks o3-mini if (!schema && extractOptions.options.prompt) { - const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger); - otherCallCount++; - otherCost += genRes.cost; + const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger, extractOptions.costTrackingOptions.costTracking); schema = genRes.extract; } @@ -278,17 +267,22 @@ export async function extractData({ extract: e, warning: w, totalUsage: t, - cost: c, } = await generateCompletions({ ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + costTrackingOptions: { + costTracking: extractOptions.costTrackingOptions.costTracking, + metadata: { + module: "scrapeURL", + method: "extractData", + description: "Check if using smartScrape is needed for this case" + }, + }, }); extract = e; warning = w; totalUsage = t; - otherCost += c; - otherCallCount++; } catch (error) { logger.error( "failed during extractSmartScrape.ts:generateCompletions", @@ -321,10 +315,9 @@ export async function extractData({ sessionId, extractId, scrapeId, + costTracking: extractOptions.costTrackingOptions.costTracking, }), ]; - smartScrapeCost += smartscrapeResults[0].tokenUsage; - smartScrapeCallCount++; } else { const pages = extract?.smartscrapePages ?? []; //do it async promiseall instead @@ -344,14 +337,10 @@ export async function extractData({ sessionId, extractId, scrapeId, + costTracking: extractOptions.costTrackingOptions.costTracking, }); }), ); - smartScrapeCost += smartscrapeResults.reduce( - (acc, result) => acc + result.tokenUsage, - 0, - ); - smartScrapeCallCount += smartscrapeResults.length; } // console.log("smartscrapeResults", smartscrapeResults); @@ -372,11 +361,17 @@ export async function extractData({ markdown: markdown, model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + costTrackingOptions: { + costTracking: extractOptions.costTrackingOptions.costTracking, + metadata: { + module: "scrapeURL", + method: "extractData", + description: "Extract data from markdown (smart-scape results)", + }, + }, }; - const { extract, warning, totalUsage, model, cost } = + const { extract } = await generateCompletions(newExtractOptions); - otherCost += cost; - otherCallCount++; return extract; }), ); @@ -399,10 +394,6 @@ export async function extractData({ return { extractedDataArray: extractedData, warning: warning, - smartScrapeCallCount: smartScrapeCallCount, - otherCallCount: otherCallCount, - smartScrapeCost: smartScrapeCost, - otherCost: otherCost, costLimitExceededTokenUsage: costLimitExceededTokenUsage, }; } diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index a913ec27..8458e506 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -3,7 +3,7 @@ import { logger as _logger } from "../../../lib/logger"; import { robustFetch } from "./fetch"; import fs from "fs/promises"; import { configDotenv } from "dotenv"; - +import { CostTracking } from "../../../lib/extract/extraction-service"; configDotenv(); // Define schemas outside the function scope @@ -52,6 +52,7 @@ export async function smartScrape({ extractId, scrapeId, beforeSubmission, + costTracking, }: { url: string, prompt: string, @@ -59,6 +60,7 @@ export async function smartScrape({ extractId?: string, scrapeId?: string, beforeSubmission?: () => unknown, + costTracking: CostTracking, }): Promise { let logger = _logger.child({ method: "smartScrape", @@ -139,6 +141,16 @@ export async function smartScrape({ }); logger.info("Smart scrape cost $" + response.tokenUsage); + costTracking.addCall({ + type: "smartScrape", + cost: response.tokenUsage, + metadata: { + module: "smartScrape", + method: "smartScrape", + url, + sessionId, + }, + }); return response; // The response type now matches SmartScrapeResult } catch (error) { diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index b545266f..2f11d945 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -5,6 +5,7 @@ process.env.ENV = "test"; import { scrapeURL } from "."; import { scrapeOptions } from "../../controllers/v1/types"; import { Engine } from "./engines"; +import { CostTracking } from "../../lib/extract/extraction-service"; const testEngines: (Engine | undefined)[] = [ undefined, @@ -32,6 +33,7 @@ describe("Standalone scrapeURL tests", () => { "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -79,6 +81,7 @@ describe("Standalone scrapeURL tests", () => { formats: ["markdown", "html"], }), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -103,6 +106,7 @@ describe("Standalone scrapeURL tests", () => { onlyMainContent: false, }), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -126,6 +130,7 @@ describe("Standalone scrapeURL tests", () => { excludeTags: [".nav", "#footer", "strong"], }), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -146,6 +151,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -164,6 +170,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -182,6 +189,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -200,6 +208,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -218,6 +227,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -236,6 +246,7 @@ describe("Standalone scrapeURL tests", () => { "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -254,6 +265,7 @@ describe("Standalone scrapeURL tests", () => { "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -286,6 +298,7 @@ describe("Standalone scrapeURL tests", () => { formats: ["screenshot"], }), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -314,6 +327,7 @@ describe("Standalone scrapeURL tests", () => { formats: ["screenshot@fullPage"], }), { forceEngine, teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -342,6 +356,7 @@ describe("Standalone scrapeURL tests", () => { "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}), { teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -361,6 +376,7 @@ describe("Standalone scrapeURL tests", () => { "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}), { teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -398,6 +414,7 @@ describe("Standalone scrapeURL tests", () => { }, }), { teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -434,6 +451,7 @@ describe("Standalone scrapeURL tests", () => { }, }), { teamId: "test" }, + new CostTracking(), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -455,7 +473,7 @@ describe("Standalone scrapeURL tests", () => { async (i) => { const url = "https://www.scrapethissite.com/?i=" + i; const id = "test:concurrent:" + url; - const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }); + const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }, new CostTracking()); const replacer = (key: string, value: any) => { if (value instanceof Error) { diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 5ad304d3..7f98bee1 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -30,6 +30,7 @@ export async function performAgent( prompt, sessionId, scrapeId: meta.id, + costTracking: meta.costTracking, }) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { @@ -50,20 +51,6 @@ export async function performAgent( if (meta.options.formats.includes("html")) { document.html = html } - - if (document.metadata.costTracking) { - document.metadata.costTracking.smartScrapeCallCount++; - document.metadata.costTracking.smartScrapeCost = document.metadata.costTracking.smartScrapeCost + smartscrapeResults.tokenUsage; - document.metadata.costTracking.totalCost = document.metadata.costTracking.totalCost + smartscrapeResults.tokenUsage; - } else { - document.metadata.costTracking = { - smartScrapeCallCount: 1, - smartScrapeCost: smartscrapeResults.tokenUsage, - otherCallCount: 0, - otherCost: 0, - totalCost: smartscrapeResults.tokenUsage, - } - } } return document; diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index 8cdea891..94df276b 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -6,9 +6,9 @@ import gitDiff from 'git-diff'; import parseDiff from 'parse-diff'; import { generateCompletions } from "./llmExtract"; -async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> { +async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any } | null> { try { - const { extract, cost } = await generateCompletions({ + const { extract } = await generateCompletions({ logger: meta.logger.child({ method: "extractDataWithSchema/generateCompletions", }), @@ -18,9 +18,16 @@ async function extractDataWithSchema(content: string, meta: Meta): Promise<{ ext systemPrompt: "Extract the requested information from the content based on the provided schema.", temperature: 0 }, - markdown: content + markdown: content, + costTrackingOptions: { + costTracking: meta.costTracking, + metadata: { + module: "extract", + method: "extractDataWithSchema", + }, + }, }); - return { extract, cost }; + return { extract }; } catch (error) { meta.logger.error("Error extracting data with schema", { error }); return null; @@ -145,19 +152,6 @@ export async function deriveDiff(meta: Meta, document: Document): Promise; + }; }; export async function generateCompletions({ logger, @@ -242,13 +247,13 @@ export async function generateCompletions({ mode = "object", providerOptions, retryModel = getModel("claude-3-5-sonnet-20240620", "anthropic"), + costTrackingOptions, }: GenerateCompletionsOptions): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage; model: string; - cost: number; }> { let extract: any; let warning: string | undefined; @@ -278,6 +283,19 @@ export async function generateCompletions({ }, }); + costTrackingOptions.costTracking.addCall({ + type: "other", + metadata: { + ...costTrackingOptions.metadata, + gcDetails: "no-object", + }, + cost: calculateCost( + currentModel.modelId, + result.usage?.promptTokens ?? 0, + result.usage?.completionTokens ?? 0, + ), + }); + extract = result.text; return { @@ -290,11 +308,6 @@ export async function generateCompletions({ totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, - cost: calculateCost( - currentModel.modelId, - result.usage?.promptTokens ?? 0, - result.usage?.completionTokens ?? 0, - ), }; } catch (error) { lastError = error as Error; @@ -321,6 +334,19 @@ export async function generateCompletions({ extract = result.text; + costTrackingOptions.costTracking.addCall({ + type: "other", + metadata: { + ...costTrackingOptions.metadata, + gcDetails: "no-object fallback", + }, + cost: calculateCost( + currentModel.modelId, + result.usage?.promptTokens ?? 0, + result.usage?.completionTokens ?? 0, + ), + }); + return { extract, warning, @@ -331,11 +357,6 @@ export async function generateCompletions({ totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, - cost: calculateCost( - currentModel.modelId, - result.usage?.promptTokens ?? 0, - result.usage?.completionTokens ?? 0, - ), }; } catch (retryError) { lastError = retryError as Error; @@ -410,7 +431,7 @@ export async function generateCompletions({ } try { - const { text: fixedText } = await generateText({ + const { text: fixedText, usage: repairUsage } = await generateText({ model: currentModel, prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`, system: @@ -421,6 +442,23 @@ export async function generateCompletions({ }, }, }); + + costTrackingOptions.costTracking.addCall({ + type: "other", + metadata: { + ...costTrackingOptions.metadata, + gcDetails: "repairConfig", + }, + cost: calculateCost( + currentModel.modelId, + repairUsage?.promptTokens ?? 0, + repairUsage?.completionTokens ?? 0, + ), + tokens: { + input: repairUsage?.promptTokens ?? 0, + output: repairUsage?.completionTokens ?? 0, + }, + }); logger.debug("Repaired text with LLM"); return fixedText; } catch (repairError) { @@ -464,6 +502,23 @@ export async function generateCompletions({ let result: { object: any; usage: TokenUsage } | undefined; try { result = await generateObject(generateObjectConfig); + costTrackingOptions.costTracking.addCall({ + type: "other", + metadata: { + ...costTrackingOptions.metadata, + gcDetails: "generateObject", + gcModel: generateObjectConfig.model.modelId, + }, + tokens: { + input: result.usage?.promptTokens ?? 0, + output: result.usage?.completionTokens ?? 0, + }, + cost: calculateCost( + currentModel.modelId, + result.usage?.promptTokens ?? 0, + result.usage?.completionTokens ?? 0, + ), + }); } catch (error) { lastError = error as Error; if ( @@ -481,6 +536,23 @@ export async function generateCompletions({ model: currentModel, }; result = await generateObject(retryConfig); + costTrackingOptions.costTracking.addCall({ + type: "other", + metadata: { + ...costTrackingOptions.metadata, + gcDetails: "generateObject fallback", + gcModel: retryConfig.model.modelId, + }, + tokens: { + input: result.usage?.promptTokens ?? 0, + output: result.usage?.completionTokens ?? 0, + }, + cost: calculateCost( + currentModel.modelId, + result.usage?.promptTokens ?? 0, + result.usage?.completionTokens ?? 0, + ), + }); } catch (retryError) { lastError = retryError as Error; logger.error("Failed with fallback model", { @@ -549,7 +621,6 @@ export async function generateCompletions({ totalTokens: promptTokens + completionTokens, }, model: currentModel.modelId, - cost: calculateCost(currentModel.modelId, promptTokens, completionTokens), }; } catch (error) { lastError = error as Error; @@ -589,9 +660,16 @@ export async function performLLMExtract( // model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + costTrackingOptions: { + costTracking: meta.costTracking, + metadata: { + module: "scrapeURL", + method: "performLLMExtract", + }, + }, }; - const { extractedDataArray, warning, smartScrapeCost, otherCost, costLimitExceededTokenUsage } = + const { extractedDataArray, warning, costLimitExceededTokenUsage } = await extractData({ extractOptions: generationOptions, urls: [meta.url], @@ -603,25 +681,6 @@ export async function performLLMExtract( document.warning = warning + (document.warning ? " " + document.warning : ""); } - if (document.metadata.costTracking) { - document.metadata.costTracking.smartScrapeCallCount++; - document.metadata.costTracking.smartScrapeCost += smartScrapeCost; - document.metadata.costTracking.otherCallCount++; - document.metadata.costTracking.otherCost += otherCost; - document.metadata.costTracking.totalCost += smartScrapeCost + otherCost; - if (costLimitExceededTokenUsage) { - document.metadata.costTracking.costLimitExceededTokenUsage = costLimitExceededTokenUsage; - } - } else { - document.metadata.costTracking = { - smartScrapeCallCount: 1, - smartScrapeCost: smartScrapeCost, - otherCallCount: 1, - otherCost: otherCost, - totalCost: smartScrapeCost + otherCost, - }; - } - // IMPORTANT: here it only get's the last page!!! const extractedData = extractedDataArray[extractedDataArray.length - 1] ?? undefined; @@ -758,7 +817,8 @@ export function removeDefaultProperty(schema: any): any { export async function generateSchemaFromPrompt( prompt: string, logger: Logger, -): Promise<{ extract: any; cost: number }> { + costTracking: CostTracking, +): Promise<{ extract: any }> { const model = getModel("gpt-4o", "openai"); const retryModel = getModel("gpt-4o-mini", "openai"); const temperatures = [0, 0.1, 0.3]; // Different temperatures to try @@ -766,7 +826,7 @@ export async function generateSchemaFromPrompt( for (const temp of temperatures) { try { - const { extract, cost } = await generateCompletions({ + const { extract } = await generateCompletions({ logger: logger.child({ method: "generateSchemaFromPrompt/generateCompletions", }), @@ -802,10 +862,16 @@ Return a valid JSON schema object with properties that would capture the informa prompt: `Generate a JSON schema for extracting the following information: ${prompt}`, // temperature: temp, }, - markdown: prompt, + costTrackingOptions: { + costTracking, + metadata: { + module: "scrapeURL", + method: "generateSchemaFromPrompt", + }, + }, }); - return { extract, cost }; + return { extract }; } catch (error) { lastError = error as Error; logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 1118a0f9..15af06bd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -75,6 +75,7 @@ import { performDeepResearch } from "../lib/deep-research/deep-research-service" import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service"; import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis"; import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0"; +import { CostTracking } from "../lib/extract/extraction-service"; configDotenv(); @@ -1010,6 +1011,7 @@ async function processJob(job: Job & { id: string }, token: string) { // }; // return data; // } + const costTracking = new CostTracking(); try { job.updateProgress({ @@ -1030,6 +1032,7 @@ async function processJob(job: Job & { id: string }, token: string) { startWebScraperPipeline({ job, token, + costTracking, }), ...(job.data.scrapeOptions.timeout !== undefined ? [ @@ -1171,6 +1174,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, + cost_tracking: costTracking, }, true, ); @@ -1276,10 +1280,6 @@ async function processJob(job: Job & { id: string }, token: string) { await finishCrawlIfNeeded(job, sc); } else { - const cost_tracking = doc?.metadata?.costTracking; - - delete doc.metadata.costTracking; - await logJob({ job_id: job.id, success: true, @@ -1293,7 +1293,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, num_tokens: 0, // TODO: fix - cost_tracking, + cost_tracking: costTracking, }); indexJob(job, doc); @@ -1442,6 +1442,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, + cost_tracking: costTracking, }, true, ); diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 03f0a015..34e2f60f 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -62,6 +62,7 @@ export interface RunWebScraperParams { is_scrape?: boolean; is_crawl?: boolean; urlInvisibleInCurrentCrawl?: boolean; + costTracking: CostTracking; } export type RunWebScraperResult = From 06770bc63ff7964f1844978f7ed11fcff3b4fbdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 10:48:13 -0700 Subject: [PATCH 153/160] fix(scrape/json): move back to 4o mini --- apps/api/src/lib/extract/completions/singleAnswer.ts | 5 +++-- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 2 -- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 866b6049..79bb52f2 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -51,7 +51,8 @@ export async function singleAnswerCompletion({ }, markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`, isExtractEndpoint: true, - model: getModel("gemini-2.0-flash", "google"), + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), costTrackingOptions: { costTracking, metadata: { @@ -75,7 +76,7 @@ export async function singleAnswerCompletion({ promptTokens: 0, completionTokens: 0, totalTokens: 0, - model: "gemini-2.0-flash", + model: "gemini-2.5-pro-preview-03-25", }, sources: singleAnswerDocs.map( (doc) => doc.metadata.url || doc.metadata.sourceURL || "", diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index e02f4c83..52ad2fa1 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -269,8 +269,6 @@ export async function extractData({ totalUsage: t, } = await generateCompletions({ ...extractOptionsNewSchema, - model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), - retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), costTrackingOptions: { costTracking: extractOptions.costTrackingOptions.costTracking, metadata: { diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 48dcd128..4ff1f254 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -658,8 +658,8 @@ export async function performLLMExtract( // model: getModel("qwen-qwq-32b", "groq"), // model: getModel("gemini-2.0-flash", "google"), // model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), - model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), - retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), + model: getModel("gpt-4o-mini", "openai"), + retryModel: getModel("gpt-4o", "openai"), costTrackingOptions: { costTracking: meta.costTracking, metadata: { From f844b329f1635e7cda306904ea8b83241d502113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 11:04:45 -0700 Subject: [PATCH 154/160] remove agent preview rate limiter --- apps/api/src/routes/v1.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index c53f55a7..185a70de 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -98,9 +98,9 @@ export function authMiddleware( rateLimiterMode = RateLimiterMode.ExtractAgentPreview; } - if (rateLimiterMode === RateLimiterMode.Scrape && isAgentExtractModelValid((req.body as any)?.agent?.model)) { - rateLimiterMode = RateLimiterMode.ScrapeAgentPreview; - } + // if (rateLimiterMode === RateLimiterMode.Scrape && isAgentExtractModelValid((req.body as any)?.agent?.model)) { + // rateLimiterMode = RateLimiterMode.ScrapeAgentPreview; + // } const auth = await authenticateUser(req, res, rateLimiterMode); From 5aa9469081879a7aadd7056ec9905dcfd07ea53e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 17 Apr 2025 11:14:09 -0700 Subject: [PATCH 155/160] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 7ff7f018..c30ba0fb 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.16.0" +__version__ = "1.17.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 7df557e59cbd4beb8c01f71e75ae3bb65b164892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 11:33:01 -0700 Subject: [PATCH 156/160] feat(cost-tracking): add model tracking and more costs --- apps/api/src/lib/extract/extraction-service.ts | 1 + apps/api/src/lib/extract/url-processor.ts | 2 ++ apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 1 + apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 7 +++++++ 4 files changed, 11 insertions(+) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index ae3df261..523c9c8c 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -72,6 +72,7 @@ export class CostTracking { type: "smartScrape" | "other", metadata: Record, cost: number, + model: string, tokens?: { input: number, output: number, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 9696bee2..6591a41b 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -28,6 +28,7 @@ export async function generateBasicCompletion(prompt: string, costTracking: Cost module: "extract", method: "generateBasicCompletion", }, + model: "openai/gpt-4o", cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0), tokens: { input: result.usage?.promptTokens ?? 0, @@ -54,6 +55,7 @@ export async function generateBasicCompletion(prompt: string, costTracking: Cost module: "extract", method: "generateBasicCompletion", }, + model: "openai/gpt-4o-mini", cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0), tokens: { input: result.usage?.promptTokens ?? 0, diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 8458e506..97e5b666 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -144,6 +144,7 @@ export async function smartScrape({ costTracking.addCall({ type: "smartScrape", cost: response.tokenUsage, + model: "firecrawl/smart-scrape", metadata: { module: "smartScrape", method: "smartScrape", diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 4ff1f254..64edea23 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -187,6 +187,8 @@ export function calculateCost( ) { const modelCosts = { "openai/o3-mini": { input_cost: 1.1, output_cost: 4.4 }, + "openai/gpt-4o-mini": { input_cost: 0.15, output_cost: 0.6 }, + "openai/gpt-4o": { input_cost: 2.5, output_cost: 10 }, "google/gemini-2.0-flash-001": { input_cost: 0.15, output_cost: 0.6 }, "deepseek/deepseek-r1": { input_cost: 0.55, output_cost: 2.19 }, "google/gemini-2.0-flash-thinking-exp:free": { @@ -289,6 +291,7 @@ export async function generateCompletions({ ...costTrackingOptions.metadata, gcDetails: "no-object", }, + model: currentModel.modelId, cost: calculateCost( currentModel.modelId, result.usage?.promptTokens ?? 0, @@ -340,6 +343,7 @@ export async function generateCompletions({ ...costTrackingOptions.metadata, gcDetails: "no-object fallback", }, + model: currentModel.modelId, cost: calculateCost( currentModel.modelId, result.usage?.promptTokens ?? 0, @@ -454,6 +458,7 @@ export async function generateCompletions({ repairUsage?.promptTokens ?? 0, repairUsage?.completionTokens ?? 0, ), + model: currentModel.modelId, tokens: { input: repairUsage?.promptTokens ?? 0, output: repairUsage?.completionTokens ?? 0, @@ -513,6 +518,7 @@ export async function generateCompletions({ input: result.usage?.promptTokens ?? 0, output: result.usage?.completionTokens ?? 0, }, + model: currentModel.modelId, cost: calculateCost( currentModel.modelId, result.usage?.promptTokens ?? 0, @@ -547,6 +553,7 @@ export async function generateCompletions({ input: result.usage?.promptTokens ?? 0, output: result.usage?.completionTokens ?? 0, }, + model: currentModel.modelId, cost: calculateCost( currentModel.modelId, result.usage?.promptTokens ?? 0, From 9bea877eb1d40f477035d02491168b991fa09cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 21:44:28 +0200 Subject: [PATCH 157/160] feat(extract): cost limit (#1473) --- apps/api/src/controllers/auth.ts | 10 ++++---- .../completions/analyzeSchemaAndPrompt.ts | 2 +- .../lib/extract/completions/batchExtract.ts | 5 +++- .../extract/completions/checkShouldExtract.ts | 2 +- .../api/src/lib/extract/extraction-service.ts | 24 +++++++++++++++++-- .../scrapeURL/lib/extractSmartScrape.ts | 6 ++++- 6 files changed, 38 insertions(+), 11 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index d0392428..e6a16cd7 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -75,11 +75,11 @@ export async function setCachedACUC( const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsageChunk = (team_id, is_extract) => ({ api_key: "preview", team_id, - sub_id: "bypass", - sub_current_period_start: new Date().toISOString(), - sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(), - sub_user_id: "bypass", - price_id: "bypass", + sub_id: null, + sub_current_period_start: null, + sub_current_period_end: null, + sub_user_id: null, + price_id: null, rate_limits: { crawl: 2, scrape: 10, diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index 2626c50b..922ebeb9 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -32,7 +32,7 @@ export async function analyzeSchemaAndPrompt( const schemaString = JSON.stringify(schema); - const model = getModel("gpt-4o"); + const model = getModel("gpt-4o", "openai"); const checkSchema = z .object({ diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index b7075c6e..f35f3f78 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -10,7 +10,7 @@ import { buildBatchExtractSystemPrompt, } from "../build-prompts"; import { getModel } from "../../generic-ai"; -import { CostTracking } from "../extraction-service"; +import { CostTracking, CostLimitExceededError } from "../extraction-service"; import fs from "fs/promises"; import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape"; import type { Logger } from "winston"; @@ -102,6 +102,9 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: extractedDataArray = e; warning = w; } catch (error) { + if (error instanceof CostLimitExceededError) { + throw error; + } logger.error("extractData failed", { error }); } diff --git a/apps/api/src/lib/extract/completions/checkShouldExtract.ts b/apps/api/src/lib/extract/completions/checkShouldExtract.ts index e2c10ade..6f678ee8 100644 --- a/apps/api/src/lib/extract/completions/checkShouldExtract.ts +++ b/apps/api/src/lib/extract/completions/checkShouldExtract.ts @@ -33,7 +33,7 @@ export async function checkShouldExtract( }, markdown: buildDocument(doc), isExtractEndpoint: true, - model: getModel("gpt-4o-mini"), + model: getModel("gpt-4o-mini", "openai"), costTrackingOptions: { costTracking, metadata: { diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 523c9c8c..6a3e6756 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -67,6 +67,14 @@ type completions = { sources?: string[]; }; +export class CostLimitExceededError extends Error { + constructor() { + super("Cost limit exceeded"); + this.message = "Cost limit exceeded"; + this.name = "CostLimitExceededError"; + } +} + export class CostTracking { calls: { type: "smartScrape" | "other", @@ -79,14 +87,21 @@ export class CostTracking { }, stack: string, }[] = []; + limit: number | null = null; - constructor() {} + constructor(limit: number | null = null) { + this.limit = limit; + } public addCall(call: Omit) { this.calls.push({ ...call, stack: new Error().stack!.split("\n").slice(2).join("\n"), }); + + if (this.limit !== null && this.toJSON().totalCost > this.limit) { + throw new CostLimitExceededError(); + } } public toJSON() { @@ -115,7 +130,8 @@ export async function performExtraction( let singleAnswerResult: any = {}; let totalUrlsScraped = 0; let sources: Record = {}; - let costTracking: CostTracking = new CostTracking(); + + let costTracking = new CostTracking(subId ? null : 1.5); let log = { extractId, @@ -532,6 +548,10 @@ export async function performExtraction( return null; } catch (error) { + if (error instanceof CostLimitExceededError) { + throw error; + } + logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL!, diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 52ad2fa1..2dffe047 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -10,7 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { getModel } from "../../../lib/generic-ai"; import { TokenUsage } from "../../../controllers/v1/types"; import type { SmartScrapeResult } from "./smartScrape"; -import { CostTracking } from "../../../lib/extract/extraction-service"; +import { CostLimitExceededError, CostTracking } from "../../../lib/extract/extraction-service"; const commonSmartScrapeProperties = { shouldUseSmartscrape: { type: "boolean", @@ -282,6 +282,10 @@ export async function extractData({ warning = w; totalUsage = t; } catch (error) { + if (error instanceof CostLimitExceededError) { + throw error; + } + logger.error( "failed during extractSmartScrape.ts:generateCompletions", { error }, From 33aece8e96b86b17304b9a46149efcccaad65e1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 14:00:44 -0700 Subject: [PATCH 158/160] more cost calc --- apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts | 2 +- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts index 3528f48f..7f48872e 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts @@ -160,7 +160,7 @@ export async function performGenerateLlmsTxt( const { extract } = await generateCompletions({ logger, - model: getModel("gpt-4o-mini"), + model: getModel("gpt-4o-mini", "openai"), options: { systemPrompt: "", mode: "llm", diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 64edea23..b63b2f85 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -187,6 +187,7 @@ export function calculateCost( ) { const modelCosts = { "openai/o3-mini": { input_cost: 1.1, output_cost: 4.4 }, + "gpt-4o-mini": { input_cost: 0.15, output_cost: 0.6 }, "openai/gpt-4o-mini": { input_cost: 0.15, output_cost: 0.6 }, "openai/gpt-4o": { input_cost: 2.5, output_cost: 10 }, "google/gemini-2.0-flash-001": { input_cost: 0.15, output_cost: 0.6 }, From f2c01340d19b8f2d15d5622a50f116464b16997a Mon Sep 17 00:00:00 2001 From: kkharji Date: Fri, 18 Apr 2025 07:59:59 +0300 Subject: [PATCH 159/160] feat(rust): update rust sdk to support new features (#1446) * chore(rust-sdk): cargo fmt * feat(rust-sdk): implement search api + example + test * feat(rust-sdk): implement crawl cancel api + example + test * feat(rust-sdk): implement crawl check errors api + example + test * feat(rust-sdk): implement batch crawl + test + example + Fix MapOptions * feat(rust-sdk): implement extract api + test + example * feat(rust-sdk): implement llmtxt api + test + example * chore(rust-sdk): correct mock tests * chore(rust-sdk): prep for cargo distribution --- apps/rust-sdk/Cargo.lock | 1377 ++++++++++++++++- apps/rust-sdk/Cargo.toml | 10 +- .../rust-sdk/examples/batch_scrape_example.rs | 175 +++ .../rust-sdk/examples/cancel_crawl_example.rs | 33 + .../examples/check_crawl_errors_example.rs | 59 + apps/rust-sdk/examples/example.rs | 30 +- apps/rust-sdk/examples/extract_example.rs | 237 +++ apps/rust-sdk/examples/llmstxt_example.rs | 173 +++ apps/rust-sdk/examples/search_example.rs | 186 +++ apps/rust-sdk/src/batch_scrape.rs | 494 ++++++ apps/rust-sdk/src/crawl.rs | 333 +++- apps/rust-sdk/src/document.rs | 5 +- apps/rust-sdk/src/error.rs | 2 + apps/rust-sdk/src/extract.rs | 596 +++++++ apps/rust-sdk/src/lib.rs | 45 +- apps/rust-sdk/src/llmstxt.rs | 426 +++++ apps/rust-sdk/src/map.rs | 6 +- apps/rust-sdk/src/scrape.rs | 18 +- apps/rust-sdk/src/search.rs | 245 +++ apps/rust-sdk/tests/e2e_with_auth.rs | 25 +- 20 files changed, 4350 insertions(+), 125 deletions(-) create mode 100644 apps/rust-sdk/examples/batch_scrape_example.rs create mode 100644 apps/rust-sdk/examples/cancel_crawl_example.rs create mode 100644 apps/rust-sdk/examples/check_crawl_errors_example.rs create mode 100644 apps/rust-sdk/examples/extract_example.rs create mode 100644 apps/rust-sdk/examples/llmstxt_example.rs create mode 100644 apps/rust-sdk/examples/search_example.rs create mode 100644 apps/rust-sdk/src/batch_scrape.rs create mode 100644 apps/rust-sdk/src/extract.rs create mode 100644 apps/rust-sdk/src/llmstxt.rs create mode 100644 apps/rust-sdk/src/search.rs diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index 2ea5de69..d7a7b64e 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -17,6 +17,21 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -32,6 +47,71 @@ dependencies = [ "libc", ] +[[package]] +name = "ansi_colours" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14eec43e0298190790f41679fe69ef7a829d2a2ddd78c8c00339e84710e435fe" +dependencies = [ + "rgb", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys 0.59.0", +] + +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + [[package]] name = "arrayref" version = "0.3.7" @@ -44,6 +124,16 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "assert_matches" version = "1.5.0" @@ -62,6 +152,72 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "axum" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de45108900e1f9b9242f7f2e254aa3e2c029c921c258fe9e6b4217eeebd54288" +dependencies = [ + "axum-core", + "axum-macros", + "bytes", + "form_urlencoded", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "backtrace" version = "0.3.73" @@ -72,7 +228,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -89,6 +245,59 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bat" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab792c2ad113a666f08856c88cdec0a62d732559b1f3982eedf0142571e669a" +dependencies = [ + "ansi_colours", + "anyhow", + "bincode", + "bugreport", + "bytesize", + "clap", + "clircle", + "console", + "content_inspector", + "encoding_rs", + "etcetera", + "flate2", + "git2", + "globset", + "grep-cli", + "home", + "indexmap 2.9.0", + "itertools", + "nu-ansi-term", + "once_cell", + "path_abs", + "plist", + "regex", + "semver", + "serde", + "serde_derive", + "serde_with", + "serde_yaml", + "shell-words", + "syntect", + "terminal-colorsaurus", + "thiserror", + "toml", + "unicode-width 0.1.14", + "walkdir", + "wild", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -112,12 +321,40 @@ dependencies = [ "constant_time_eq", ] +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bugreport" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f280f65ce85b880919349bbfcb204930291251eedcb2e5f84ce2f51df969c162" +dependencies = [ + "git-version", + "shell-escape", + "sysinfo", +] + [[package]] name = "bumpalo" version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytemuck" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" + [[package]] name = "byteorder" version = "1.5.0" @@ -130,11 +367,22 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +[[package]] +name = "bytesize" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e93abca9e28e0a1b9877922aacb20576e05d4679ffa78c3d6dc22a26a216659" + [[package]] name = "cc" version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5208975e568d83b6b05cc0a063c8e7e9acc2b43bee6da15616a5b73e109d7437" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] [[package]] name = "cfg-if" @@ -155,6 +403,47 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "clap" +version = "4.5.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8aa86934b44c19c50f87cc2790e19f54f7a67aedb64101c2e1a2e5ecfb73944" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2414dbb2dd0695280da6ea9261e327479e9d37b0630f6b53ba2a11c60c679fd9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + [[package]] name = "clippy" version = "0.0.302" @@ -164,12 +453,59 @@ dependencies = [ "term", ] +[[package]] +name = "clircle" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d9334f725b46fb9bed8580b9b47a932587e044fadb344ed7fa98774b067ac1a" +dependencies = [ + "cfg-if", + "windows 0.56.0", +] + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "colored" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fde0e0ec90c9dfb3b4b1a0891a7dcd0e2bffde2f7efed5fe7c9bb00e5bfb915e" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width 0.2.0", + "windows-sys 0.59.0", +] + [[package]] name = "constant_time_eq" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" +[[package]] +name = "content_inspector" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7bda66e858c683005a53a9a60c69a4aca7eeaa45d124526e389f7aec8e62f38" +dependencies = [ + "memchr", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -182,9 +518,37 @@ dependencies = [ [[package]] name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] [[package]] name = "crossbeam-utils" @@ -255,10 +619,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" [[package]] -name = "encoding_rs" -version = "0.8.34" +name = "dyn-clone" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] @@ -279,6 +661,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + [[package]] name = "fastrand" version = "2.1.0" @@ -287,13 +680,19 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "firecrawl" -version = "1.0.0" +version = "1.1.0" dependencies = [ "assert_matches", + "axum", + "bat", + "clap", "clippy", "dotenvy", + "futures", "log", + "mockito", "reqwest", + "schemars", "serde", "serde_json", "serde_with", @@ -302,6 +701,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "flate2" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" +dependencies = [ + "crc32fast", + "miniz_oxide 0.8.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -333,10 +742,25 @@ dependencies = [ ] [[package]] -name = "futures-channel" -version = "0.3.30" +name = "futures" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -344,36 +768,60 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -404,12 +852,90 @@ dependencies = [ "wasi 0.11.0+wasi-snapshot-preview1", ] +[[package]] +name = "getrandom" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + [[package]] name = "gimli" version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "git-version" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" +dependencies = [ + "git-version-macro", +] + +[[package]] +name = "git-version-macro" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "git2" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b903b73e45dc0c6c596f2d37eccece7c1c8bb6e4407b001096387c63d0d93724" +dependencies = [ + "bitflags 2.6.0", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "grep-cli" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f1288f0e06f279f84926fa4c17e3fcd2a22b357927a82f2777f7be26e4cec0" +dependencies = [ + "bstr", + "globset", + "libc", + "log", + "termcolor", + "winapi-util", +] + [[package]] name = "h2" version = "0.4.5" @@ -422,7 +948,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.2.6", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -437,9 +963,15 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" -version = "0.14.5" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hermit-abi" @@ -453,6 +985,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "http" version = "1.1.0" @@ -494,10 +1035,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] -name = "hyper" -version = "1.4.1" +name = "httpdate" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -506,6 +1053,7 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -561,7 +1109,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", + "tower 0.4.13", "tower-service", "tracing", ] @@ -577,7 +1125,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.52.0", ] [[package]] @@ -618,12 +1166,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "serde", ] @@ -633,12 +1181,37 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +dependencies = [ + "getrandom 0.3.2", + "libc", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -650,9 +1223,39 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" + +[[package]] +name = "libgit2-sys" +version = "0.17.0+1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10472326a8a6477c3c20a64547b0059e4b0d086869eee31e6d7da728a8eb7224" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "libz-sys" +version = "1.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b70e7a7df205e92a1a4cd9aaae7898dac0aa555503cc0a649494d0d60e7651d" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linux-raw-sys" @@ -676,6 +1279,12 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "memchr" version = "2.7.4" @@ -697,6 +1306,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" @@ -709,6 +1327,30 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "mockito" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7760e0e418d9b7e5777c0374009ca4c93861b9066f18cb334a20ce50ab63aa48" +dependencies = [ + "assert-json-diff", + "bytes", + "colored", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "log", + "rand", + "regex", + "serde_json", + "serde_urlencoded", + "similar", + "tokio", +] + [[package]] name = "native-tls" version = "0.2.12" @@ -726,6 +1368,24 @@ dependencies = [ "tempfile", ] +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -752,9 +1412,31 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] [[package]] name = "openssl" @@ -823,6 +1505,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "path_abs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ef02f6342ac01d8a93b65f96db53fe68a92a15f41144f97fb00a9e669633c3" +dependencies = [ + "std_prelude", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -867,12 +1558,34 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "plist" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac26e981c03a6e53e0aee43c113e3202f5581d5360dae7bd2c70e800dd0451d" +dependencies = [ + "base64 0.22.1", + "indexmap 2.9.0", + "quick-xml", + "serde", + "time", +] + [[package]] name = "powerfmt" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -882,6 +1595,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-xml" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3a6e5838b60e0e8fa7a43f22ade549a37d61f8bdbe636d0d7816191de969c2" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.36" @@ -891,6 +1613,62 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha", + "rand_core", + "zerocopy", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.1.57" @@ -917,6 +1695,35 @@ dependencies = [ "rust-argon2", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "reqwest" version = "0.12.5" @@ -961,6 +1768,15 @@ dependencies = [ "winreg", ] +[[package]] +name = "rgb" +version = "0.8.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.8" @@ -1047,12 +1863,27 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" + [[package]] name = "ryu" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.23" @@ -1062,6 +1893,30 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1091,6 +1946,12 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.204" @@ -1111,6 +1972,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.120" @@ -1122,6 +1994,25 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_spanned" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -1136,15 +2027,15 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.9.0" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" +checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.2.6", + "indexmap 2.9.0", "serde", "serde_derive", "serde_json", @@ -1154,9 +2045,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.9.0" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350" +checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" dependencies = [ "darling", "proc-macro2", @@ -1164,6 +2055,31 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.9.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "shell-escape" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" + +[[package]] +name = "shell-words" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -1173,6 +2089,12 @@ dependencies = [ "libc", ] +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + [[package]] name = "slab" version = "0.4.9" @@ -1204,6 +2126,12 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "std_prelude" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8207e78455ffdf55661170876f88daf85356e4edd54e0a3dbc79586ca1e50cbe" + [[package]] name = "strsim" version = "0.11.1" @@ -1233,6 +2161,42 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +[[package]] +name = "syntect" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874dcfa363995604333cf947ae9f751ca3af4522c60886774c4963943b4746b1" +dependencies = [ + "bincode", + "bitflags 1.3.2", + "flate2", + "fnv", + "once_cell", + "onig", + "plist", + "regex-syntax", + "serde", + "serde_derive", + "serde_json", + "thiserror", + "walkdir", + "yaml-rust", +] + +[[package]] +name = "sysinfo" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "rayon", + "windows 0.57.0", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -1277,6 +2241,51 @@ dependencies = [ "winapi", ] +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "terminal-colorsaurus" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7afe4c174a3cbfb52ebcb11b28965daf74fe9111d4e07e40689d05af06e26e8" +dependencies = [ + "cfg-if", + "libc", + "memchr", + "mio", + "terminal-trx", + "windows-sys 0.59.0", + "xterm-color", +] + +[[package]] +name = "terminal-trx" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "975b4233aefa1b02456d5e53b22c61653c743e308c51cf4181191d8ce41753ab" +dependencies = [ + "cfg-if", + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "terminal_size" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9" +dependencies = [ + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "thiserror" version = "1.0.61" @@ -1345,9 +2354,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.42.0" +version = "1.44.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" +checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" dependencies = [ "backtrace", "bytes", @@ -1363,9 +2372,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -1406,6 +2415,41 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd87a5cdd6ffab733b2f74bc4fd7ee5fff6634124999ac278c35fc78c6120148" +dependencies = [ + "indexmap 2.9.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" +dependencies = [ + "indexmap 2.9.0", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.4.13" @@ -1422,16 +2466,32 @@ dependencies = [ ] [[package]] -name = "tower-layer" -version = "0.3.2" +name = "tower" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -1439,6 +2499,7 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ + "log", "pin-project-lite", "tracing-core", ] @@ -1479,6 +2540,24 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.9.0" @@ -1496,6 +2575,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.10.0" @@ -1511,6 +2596,16 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -1532,6 +2627,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -1608,6 +2712,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "wild" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1" +dependencies = [ + "glob", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1624,12 +2737,41 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de69df01bdf1ead2f4ac895dc77c9351aefff65b2f3db429a343f9cbf05e132" +dependencies = [ + "windows-core 0.56.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-core" version = "0.52.0" @@ -1639,6 +2781,83 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4698e52ed2d08f8658ab0c39512a7c00ee5fe2688c65f8c0a4f06750d729f2a6" +dependencies = [ + "windows-implement 0.56.0", + "windows-interface 0.56.0", + "windows-result", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6fc35f58ecd95a9b71c4f2329b911016e6bec66b3f2e6a4aad86bd2e99e2f9b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08990546bf4edef8f431fa6326e032865f27138718c587dc21bc0265bbcb57cc" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -1657,6 +2876,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -1778,6 +3006,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63d3fcd9bba44b03821e7d699eeee959f3126dcc4aa8e4ae18ec617c2a5cea10" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.52.0" @@ -1788,6 +3025,50 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "xterm-color" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4de5f056fb9dc8b7908754867544e26145767187aaac5a98495e88ad7cb8a80f" + +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + +[[package]] +name = "zerocopy" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zeroize" version = "1.8.1" diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 6ea8d179..3affd864 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,13 +1,13 @@ [package] name = "firecrawl" author= "Mendable.ai" -version = "1.0.0" +version = "1.1.0" edition = "2021" license = "MIT" homepage = "https://www.firecrawl.dev/" repository ="https://github.com/mendableai/firecrawl" description = "Rust SDK for Firecrawl API." -authors = ["Gergő Móricz ", "sanix-darker "] +authors = ["Gergő Móricz ", "sanix-darker ", "kkharji "] [lib] path = "src/lib.rs" @@ -23,12 +23,18 @@ log = "^0.4" thiserror = "^1.0" uuid = { version = "^1.10", features = ["v4"] } tokio = { version = "^1", features = ["full"] } +futures = "0.3.31" +schemars = "0.8.22" [dev-dependencies] clippy = "^0.0.302" assert_matches = "^1.5" dotenvy = "^0.15" tokio = { version = "1", features = ["full"] } +mockito = "1.7.0" +clap = { version ="4.5.35", features = ["derive"] } +axum = { version = "0.8.3", features = ["tokio", "macros"] } +bat = "0.25.0" [build-dependencies] tokio = { version = "1", features = ["full"] } diff --git a/apps/rust-sdk/examples/batch_scrape_example.rs b/apps/rust-sdk/examples/batch_scrape_example.rs new file mode 100644 index 00000000..7db7856e --- /dev/null +++ b/apps/rust-sdk/examples/batch_scrape_example.rs @@ -0,0 +1,175 @@ +use clap::{Parser, Subcommand}; +use firecrawl::{ + batch_scrape::{BatchScrapeParams, WebhookOptions}, + map::MapOptions, + scrape::{ScrapeFormats, ScrapeOptions}, + FirecrawlApp, +}; +use serde_json::Value; +use std::error::Error; +use std::net::SocketAddr; +use std::sync::Arc; +use tokio::sync::Mutex; + +// Store webhook responses +struct WebhookState { + responses: Vec, +} + +#[derive(Parser)] +#[command(version, about, long_about = None)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Multiple URL scraping with webhook monitoring + Basic, +} + +async fn create_firecrawl_app() -> Result> { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + FirecrawlApp::new_selfhosted(api_url, None::<&str>).map_err(|e| e.into()) +} + +// Start webhook server and return its address +async fn start_webhook_server( + port: u16, + state: Arc>, +) -> Result> { + let state = state.clone(); + use axum::routing::post; + use axum::Json; + + let app = axum::Router::new().route( + "/", + post(move |body: Json| { + let state = state.clone(); + async move { + state.lock().await.responses.push(body.0.clone()); + match serde_json::to_string_pretty(&body.0) { + Ok(data) => println!( + "Received webhook: {}", + serde_json::to_string_pretty(&data).unwrap() + ), + Err(_) => println!("Received webhook: {}", body.0), + } + "OK" + } + }), + ); + + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + let webhook_url = format!("http://host.docker.internal:{}", port); + + tokio::spawn(async move { + let listener = tokio::net::TcpListener::bind(addr) + .await + .inspect_err(|err| println!("{err:?}")) + .unwrap(); + + if let Err(e) = axum::serve(listener, app).await { + eprintln!("Webhook server error: {}", e); + } + }); + + println!("Webhook server running at {}", webhook_url); + + Ok(webhook_url) +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let cli = Cli::parse(); + let firecrawl = create_firecrawl_app().await?; + + let state = Arc::new(Mutex::new(WebhookState { responses: vec![] })); + let webhook_url = start_webhook_server(39120, state.clone()).await?; + + match cli.command { + Commands::Basic => { + let mut urls = Vec::new(); + + let url_one = "https://invalid-url.url/"; + println!("Mapping: {}", url_one); + match firecrawl.map_url(url_one, None).await { + Ok(mapped_urls) => urls.extend(mapped_urls), + Err(e) => println!("Error mapping {}: {}", url_one, e), + } + + let url_two = "https://www.devjobsscanner.com"; + println!("Mapping: {}", url_two); + match firecrawl + .map_url( + url_two, + Some(MapOptions { + search: Some("rust".into()), + limit: Some(20), + ..Default::default() + }), + ) + .await + { + Ok(mapped_urls) => urls.extend(mapped_urls), + Err(e) => println!("Error mapping {}: {}", url_two, e), + } + + test_multiple_urls(&firecrawl, urls, &webhook_url).await?; + + // Give time for webhooks to arrive + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + println!( + "Received {} webhook responses", + state.lock().await.responses.len() + ); + } + } + + Ok(()) +} + +async fn test_multiple_urls( + app: &FirecrawlApp, + urls: Vec, + webhook_url: &str, +) -> Result<(), Box> { + println!("Testing batch scraping of {} URLs", urls.len()); + + let webhook = WebhookOptions { + url: webhook_url.to_string(), + headers: None, + auth_token: None, + }; + + let params = BatchScrapeParams { + urls, + webhook: Some(webhook), + ignore_invalid_urls: true, + options: Some(ScrapeOptions { + formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::Links]), + ..Default::default() + }), + ..Default::default() + }; + + let batch = app.async_batch_scrape_urls(params).await?; + println!("Batch job started: {}", batch.id); + + // Poll status periodically + loop { + let status = app.check_batch_scrape_status(&batch.id).await?; + println!("Progress: {}/{} pages", status.completed, status.total); + + if status.completed >= status.total { + println!("Batch job completed!"); + break; + } + + tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; + } + + Ok(()) +} diff --git a/apps/rust-sdk/examples/cancel_crawl_example.rs b/apps/rust-sdk/examples/cancel_crawl_example.rs new file mode 100644 index 00000000..3451cc10 --- /dev/null +++ b/apps/rust-sdk/examples/cancel_crawl_example.rs @@ -0,0 +1,33 @@ +use firecrawl::FirecrawlApp; +use std::error::Error; +use std::time::Duration; + +#[tokio::main] +async fn main() -> Result<(), Box> { + // Get API URL from environment + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + + // Create the FirecrawlApp instance + let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?; + + // Start a crawl job + println!("Starting a crawl job..."); + let crawl_response = firecrawl + .crawl_url_async("https://example.com", None) + .await?; + println!("Crawl job started with ID: {}", crawl_response.id); + + // Wait for a moment to let the crawl job start + println!("Waiting for a moment..."); + tokio::time::sleep(Duration::from_secs(2)).await; + + // Cancel the crawl job + println!("Cancelling the crawl job..."); + let cancel_response = firecrawl.cancel_crawl(&crawl_response.id).await?; + + println!("Cancellation result:"); + println!(" Status: {:?}", cancel_response.status); + + Ok(()) +} diff --git a/apps/rust-sdk/examples/check_crawl_errors_example.rs b/apps/rust-sdk/examples/check_crawl_errors_example.rs new file mode 100644 index 00000000..7629d36c --- /dev/null +++ b/apps/rust-sdk/examples/check_crawl_errors_example.rs @@ -0,0 +1,59 @@ +use firecrawl::FirecrawlApp; +use std::error::Error; + +#[tokio::main] +async fn main() -> Result<(), Box> { + // Get API URL from environment + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + + // Create the FirecrawlApp instance + let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?; + + // Start a crawl job that will likely have some errors (invalid URL format) + println!("Starting a crawl job..."); + let crawl_response = firecrawl + .crawl_url_async("https://no-wer-agg.invalid", None) + .await?; + println!("Crawl job started with ID: {}", crawl_response.id); + + println!("Let it do it's thing..."); + tokio::time::sleep(tokio::time::Duration::from_secs(3)).await; + + // Check the crawl errors + println!("Checking for crawl errors..."); + match firecrawl.check_crawl_errors(&crawl_response.id).await { + Ok(error_response) => { + println!("Crawl errors response:"); + println!(" Number of errors: {}", error_response.errors.len()); + + if !error_response.errors.is_empty() { + println!("\nDetailed errors:"); + for (i, error) in error_response.errors.iter().enumerate() { + println!("Error #{}", i + 1); + println!(" ID: {}", error.id); + if let Some(timestamp) = &error.timestamp { + println!(" Timestamp: {}", timestamp); + } + println!(" URL: {}", error.url); + println!(" Error: {}", error.error); + } + } + + println!( + "\nRobots.txt blocked URLs: {}", + error_response.robots_blocked.len() + ); + for (i, url) in error_response.robots_blocked.iter().enumerate() { + println!(" {}. {}", i + 1, url); + } + } + Err(e) => { + println!("Failed to check crawl errors: {}", e); + } + } + let cancel = firecrawl.cancel_crawl(&crawl_response.id).await?; + println!("Cancel: {}", cancel.status); + + Ok(()) +} diff --git a/apps/rust-sdk/examples/example.rs b/apps/rust-sdk/examples/example.rs index 0dcb0d46..51592f3f 100644 --- a/apps/rust-sdk/examples/example.rs +++ b/apps/rust-sdk/examples/example.rs @@ -1,4 +1,8 @@ -use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp}; +use firecrawl::{ + crawl::CrawlOptions, + scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, + FirecrawlApp, +}; use serde_json::json; #[tokio::main] @@ -19,19 +23,20 @@ async fn main() { // Crawl a website let crawl_options = CrawlOptions { - exclude_paths: vec![ "blog/*".into() ].into(), + exclude_paths: vec!["blog/*".into()].into(), ..Default::default() }; - - let crawl_result = app - .crawl_url("https://mendable.ai", crawl_options) - .await; + + let crawl_result = app.crawl_url("https://mendable.ai", crawl_options).await; match crawl_result { - Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data), + Ok(data) => println!( + "Crawl Result (used {} credits):\n{:#?}", + data.credits_used, data.data + ), Err(e) => eprintln!("Crawl failed: {}", e), } - + // Scrape with Extract let json_schema = json!({ "type": "object", @@ -57,11 +62,12 @@ async fn main() { }); let llm_extraction_options = ScrapeOptions { - formats: vec![ ScrapeFormats::Extract ].into(), + formats: vec![ScrapeFormats::Extract].into(), extract: ExtractOptions { schema: json_schema.into(), ..Default::default() - }.into(), + } + .into(), ..Default::default() }; @@ -75,9 +81,7 @@ async fn main() { } // Map a website (Alpha) - let map_result = app - .map_url("https://firecrawl.dev", None) - .await; + let map_result = app.map_url("https://firecrawl.dev", None).await; match map_result { Ok(data) => println!("Mapped URLs: {:#?}", data), diff --git a/apps/rust-sdk/examples/extract_example.rs b/apps/rust-sdk/examples/extract_example.rs new file mode 100644 index 00000000..47f446ae --- /dev/null +++ b/apps/rust-sdk/examples/extract_example.rs @@ -0,0 +1,237 @@ +use firecrawl::{extract::ExtractParams, FirecrawlApp}; +use serde_json::json; +use std::error::Error; + +use clap::{Parser, ValueEnum}; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(value_enum)] + command: Examples, +} + +#[derive(Copy, Clone, PartialEq, Eq, ValueEnum)] +enum Examples { + Basic, + Schema, + JsonSchema, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?; + let urls = vec![ + "https://www.firecrawl.dev/".to_string(), + "https://betteruptime.com".to_string(), + ]; + + match args.command { + Examples::Basic => { + println!("Example 1: Extracting with URLs and prompt"); + + let extract_params = ExtractParams { + prompt: Some( + "Extract Product promise, consice descirption and category".to_string(), + ), + url_trace: Some(true), + ..Default::default() + }; + + println!("Starting asynchronous extraction job..."); + let response = firecrawl + .async_extract(ExtractParams { + urls: Some(urls.iter().map(|u| u.to_string()).collect()), + prompt: extract_params.prompt.clone(), + url_trace: extract_params.url_trace, + ..Default::default() + }) + .await?; + + println!("Extract job initiated:"); + println!(" Job ID: {}", response.id); + + println!("\nChecking extract status..."); + for _ in 0..5 { + let response = firecrawl.get_extract_status(&response.id).await?; + + println!("Extract status: {}", response.status); + if let Some(url_trace) = &response.url_trace { + println!("URL traces:"); + for trace in url_trace { + println!(" URL: {}", trace.url); + println!(" Status: {}", trace.status); + } + } + println!("Extract data: {:#?}", response.data); + if response.status == "completed" { + break; + } + + tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; + } + } + Examples::Schema => { + println!("Example 2: Extracting with schema"); + + let schema = json!({ + "type": "object", + "properties": { + "category": { "type": "string" }, + "promise": { "type": "string" }, + "descirption": { "type": "string" } + }, + "required": ["category", "promise", "description"] + }); + + println!("Starting synchronous extraction job..."); + + match firecrawl + .extract(ExtractParams { + urls: urls.into(), + schema: Some(schema), + ..Default::default() + }) + .await + { + Ok(result) => { + println!("Extraction completed successfully!"); + println!("Status: {}", result.status); + + if let Some(data) = result.data { + println!("\nExtracted data:"); + println!(" Title: {}", data["title"]); + if let Some(desc) = data.get("description") { + println!(" Description: {}", desc); + } + println!( + " Content (preview): {:.100}...", + data["content"].as_str().unwrap_or("N/A") + ); + } + + if let Some(sources) = result.sources { + println!("\nSources:"); + for (field, urls) in sources { + println!(" {}: {}", field, urls.join(", ")); + } + } + } + Err(e) => { + println!("Extraction failed: {}", e); + } + } + } + Examples::JsonSchema => { + println!("Example 3: Using JsonSchema derive"); + + /// A comprehensive analysis of given product + #[derive(serde::Serialize, serde::Deserialize, schemars::JsonSchema)] + struct ProductAnalysis { + /// The full name of the product + product_name: String, + /// The company/brand behind the product + brand: String, + /// The general price range (e.g. "Premium", "$10-50", "Enterprise") + price_range: String, + /// The main customer segments this product targets + target_audience: Vec, + /// Primary benefits and value propositions of the product + key_benefits: Vec, + /// Distinctive features that set this product apart from competitors + unique_selling_points: Vec, + /// Direct comparisons with competing products/services + competitor_comparison: Vec, + /// Technologies, frameworks, or platforms used (if applicable) + tech_stack: Option>, + /// Aggregated review data and sentiment analysis + reviews_summary: ReviewsSummary, + // /// Score from 0-10 indicating product-market fit based on analysis + // market_fit_score: f32, // NOTE: Breaks + /// Assessment of future growth prospects (e.g. "High", "Moderate", "Limited") + growth_potential: String, + /// Relevant compliance standards and certifications + regulatory_compliance: Option>, + } + + /// Aggregated analysis of product reviews from multiple sources + #[derive(serde::Serialize, serde::Deserialize, schemars::JsonSchema)] + struct ReviewsSummary { + /// Overall sentiment from review analysis (e.g. "Highly Positive", "Mixed", "Negative") + sentiment_analysis: String, + /// Most frequently mentioned positive aspects + common_praises: Vec, + /// Most frequently mentioned criticisms or issues + common_complaints: Vec, + /// Platforms or websites where reviews were sourced from + review_sources: Vec, + } + println!("Starting extraction with derived schema..."); + match firecrawl + .extract_with_schemars::(ExtractParams { + urls: urls.into(), + ..Default::default() + }) + .await + { + Ok(result) => { + println!("Extraction completed!"); + println!("Status: {}", result.status); + + if let Some(data) = result.data { + if let Ok(analysis) = serde_json::from_value::(data) { + println!("\nExtracted Product Analysis:"); + println!(" Product: {}", analysis.product_name); + println!(" Brand: {}", analysis.brand); + println!(" Price Range: {}", analysis.price_range); + println!(" Target Audience:"); + for audience in analysis.target_audience { + println!(" - {}", audience); + } + println!(" Key Benefits:"); + for benefit in analysis.key_benefits { + println!(" - {}", benefit); + } + println!(" USPs:"); + for usp in analysis.unique_selling_points { + println!(" - {}", usp); + } + + println!("\n Reviews Summary:"); + println!( + " Sentiment: {}", + analysis.reviews_summary.sentiment_analysis + ); + println!(" Common Praises:"); + for praise in analysis.reviews_summary.common_praises { + println!(" - {}", praise); + } + println!(" Common Complaints:"); + for complaint in analysis.reviews_summary.common_complaints { + println!(" - {}", complaint); + } + } else { + println!("Failed to parse extracted data"); + } + } + + if let Some(sources) = result.sources { + println!("\nSources:"); + for (field, urls) in sources { + println!(" {}: {}", field, urls.join(", ")); + } + } + } + Err(e) => { + println!("Extraction failed: {}", e); + } + } + } + } + + Ok(()) +} diff --git a/apps/rust-sdk/examples/llmstxt_example.rs b/apps/rust-sdk/examples/llmstxt_example.rs new file mode 100644 index 00000000..64fb3317 --- /dev/null +++ b/apps/rust-sdk/examples/llmstxt_example.rs @@ -0,0 +1,173 @@ +#![allow(clippy::option_map_unit_fn)] +use bat::{Input, PrettyPrinter}; +use firecrawl::{llmstxt::GenerateLLMsTextParams, FirecrawlApp}; +use std::error::Error; + +use clap::{Parser, ValueEnum}; + +#[derive(Copy, Clone, PartialEq, Eq, ValueEnum)] +enum Mode { + Basic, + Pool, + Fulltext, +} + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +struct Args { + /// URL for which to generate LLMs.txt + #[arg(default_value = "https://www.firecrawl.dev/")] + url: String, + + #[arg(long, short = 'm', value_enum, default_value = "Mode::Basic")] + mode: Mode, + + /// Maximum number of URLs to process + #[arg(long, short = 'd', default_value = "1")] + max_urls: u32, + + /// Whether to show the full LLMs-full.txt in the response + #[arg(long, short = 'f', default_value = "false")] + full_text: bool, + + /// Experimental streaming option + #[arg(long, short = 's', default_value = "false")] + stream: bool, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?; + + let params = GenerateLLMsTextParams { + url: args.url.clone(), + max_urls: args.max_urls, + show_full_text: args.full_text, + experimental_stream: args.stream, + }; + + match args.mode { + Mode::Basic => { + println!("Example 1: Basic LLMs.txt generation (synchronous)"); + println!("Generating LLMs.txt for {}...", args.url); + firecrawl + .generate_llms_text(params) + .await + .inspect(|result| { + println!("Expires at: {}", result.expires_at); + let text = (if args.full_text { + result.data.full.as_ref() + } else { + result.data.compact.as_ref() + }) + .expect("LLM Text"); + + pretty_print_content("Firecrawl Result", text).expect("Print"); + })?; + } + Mode::Pool => { + println!("Example 2: Asynchronous LLMs.txt generation with manual polling"); + + println!("Starting asynchronous LLMs.txt generation job..."); + let response = firecrawl.async_generate_llms_text(params).await?; + + println!("LLMs.txt generation job initiated:"); + println!(" Job ID: {}", response.id); + println!("\nManually polling for status..."); + for _ in 0..10 { + let status = firecrawl + .check_generate_llms_text_status(&response.id) + .await?; + + match status.status.as_str() { + "completed" => { + println!("LLMs.txt generation completed!"); + let text = (if args.full_text { + status.data.full.as_ref() + } else { + status.data.compact.as_ref() + }) + .expect("LLM Text"); + + pretty_print_content("Pool Result", text).expect("Print"); + + break; + } + "failed" => { + println!( + "LLMs.txt generation failed: {}", + status.error.unwrap_or_default() + ); + break; + } + status => println!("Generation status: {}", status), + } + + println!("Waiting 2 seconds before checking again..."); + tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; + } + } + Mode::Fulltext => { + println!("Example 3: LLMs.txt generation with full text"); + + println!("Generating LLMs.txt with full text..."); + match firecrawl.generate_llms_text(params).await { + Ok(result) => { + println!("LLMs.txt generation completed successfully!"); + let llmstxt = result.data.compact.expect("LLMs Text Expected"); + let fulltxt = result.data.full.expect("Full LLMs Text Expected"); + + pretty_print_contents(&[ + ("LLMs.txt (compact)", llmstxt), + ("LLMs.txt (full text)", fulltxt), + ]) + .expect("Print") + } + Err(e) => { + println!("LLMs.txt generation failed: {}", e); + } + } + } + } + + Ok(()) +} + +/// Pretty prints the provided content with syntax highlighting +fn pretty_print_content(title: &str, content: &str) -> Result<(), Box> { + PrettyPrinter::new() + .header(true) + .grid(true) + .input( + Input::from_bytes(content.as_bytes()) + .title(title) + .name("file.md"), + ) + .print()?; + + Ok(()) +} + +/// Pretty prints multiple contents with syntax highlighting +fn pretty_print_contents(title_contents: &[(&'static str, String)]) -> Result<(), Box> { + let mut inputs = Vec::new(); + for (title, content) in title_contents { + inputs.push( + Input::from_bytes(content.as_bytes()) + .title(*title) + .name("file.md"), + ); + } + + PrettyPrinter::new() + .header(true) + .grid(true) + .inputs(inputs) + .print()?; + + Ok(()) +} diff --git a/apps/rust-sdk/examples/search_example.rs b/apps/rust-sdk/examples/search_example.rs new file mode 100644 index 00000000..28437193 --- /dev/null +++ b/apps/rust-sdk/examples/search_example.rs @@ -0,0 +1,186 @@ +use clap::{Parser, ValueEnum}; +use firecrawl::{ + search::{SearchParams, SearchResponse}, + FirecrawlApp, +}; +use std::error::Error; + +#[derive(Debug, Parser)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Which example to run + #[arg(value_enum, default_value_t = Examples::All)] + example: Examples, +} + +#[derive(Debug, Clone, ValueEnum)] +enum Examples { + All, + Basic, + Advanced, + Geo, + Temporal, + Social, + News, + Academic, + Commercial, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?; + + match args.example { + Examples::All => { + run_basic_example(&firecrawl).await?; + run_advanced_example(&firecrawl).await?; + run_geographic_example(&firecrawl).await?; + run_temporal_example(&firecrawl).await?; + run_social_example(&firecrawl).await?; + run_news_example(&firecrawl).await?; + run_academic_example(&firecrawl).await?; + run_commercial_example(&firecrawl).await?; + } + Examples::Basic => run_basic_example(&firecrawl).await?, + Examples::Advanced => run_advanced_example(&firecrawl).await?, + Examples::Geo => run_geographic_example(&firecrawl).await?, + Examples::Temporal => run_temporal_example(&firecrawl).await?, + Examples::Social => run_social_example(&firecrawl).await?, + Examples::News => run_news_example(&firecrawl).await?, + Examples::Academic => run_academic_example(&firecrawl).await?, + Examples::Commercial => run_commercial_example(&firecrawl).await?, + } + + Ok(()) +} +async fn run_basic_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "rust programming language"; + let results = firecrawl.search(query, None).await?; + print_results("Basic Search", query, &results); + Ok(()) +} + +async fn run_advanced_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "rust web framework site:github.com OR site:gitlab.com"; + let params = SearchParams { + query: query.to_string(), + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Advanced Repository Search", query, &results); + Ok(()) +} + +async fn run_geographic_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "coworking space startup hub"; + let params = SearchParams { + query: query.to_string(), + // WARN: Doesn't work with searxng + location: Some("Silicon Valley, CA".to_string()), + // WARN: Doesn't work with searxng + country: Some("us".to_string()), + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Geographic-Specific Search", query, &results); + Ok(()) +} + +async fn run_temporal_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "artificial intelligence breakthroughs"; + let params = SearchParams { + query: query.to_string(), + // WARN: Doesn't work with searxng + tbs: Some("qdr:m1".to_string()), + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Recent AI News", query, &results); + Ok(()) +} + +async fn run_social_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "viral tech trends site:twitter.com"; + let params = SearchParams { + query: query.to_string(), + // WARN: Doesn't work. Maybe searxng related + filter: Some("site:twitter.com OR site:linkedin.com".to_string()), + // WARN: Doesn't work with searxng + tbs: Some("qdr:w".to_string()), // Last week + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Social Media Tech Trends", query, &results); + Ok(()) +} + +async fn run_news_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = + "cryptocurrency market analysis site:reuters.com OR site:bloomberg.com OR site:ft.com"; + let params = SearchParams { + query: query.to_string(), + // WARN: Doesn't work with searxng + tbs: Some("qdr:d".to_string()), // Last 24 hours + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Financial News Search", query, &results); + Ok(()) +} + +async fn run_academic_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "quantum computing research papers site:arxiv.org OR site:scholar.google.com"; + let params = SearchParams { + query: query.to_string(), + // WARN: Doesn't work. Maybe searxng related + // filter: Some("site:arxiv.org OR site:scholar.google.com".to_string()), + // WARN: Doesn't work with searxng + tbs: Some("qdr:y".to_string()), // Last year + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Academic Research Search", query, &results); + Ok(()) +} + +async fn run_commercial_example(firecrawl: &FirecrawlApp) -> Result<(), Box> { + let query = "enterprise cloud solutions reviews site:g2.com"; + let params = SearchParams { + query: query.to_string(), + limit: Some(5), + ..Default::default() + }; + let results = firecrawl.search_with_params(params).await?; + print_results("Commercial Product Search", query, &results); + Ok(()) +} + +fn print_results(name: &str, query: &str, results: &SearchResponse) { + let sec = "=".repeat(70); + + println!("\n{sec}"); + println!("🔍 {name}"); + println!("🔎 Query: \"{query}\""); + println!("{sec}"); + + for (i, doc) in results.data.iter().enumerate() { + println!("{}. 📌 Title: {}", i + 1, doc.title); + println!(" - 🔗 URL: {}", doc.url); + println!(" - 📝 Description: \"{:.40}\"...", doc.description); + } + + if let Some(warning) = &results.warning { + println!("\n⚠️ Warning: {warning}"); + } + println!("{sec}\n"); +} diff --git a/apps/rust-sdk/src/batch_scrape.rs b/apps/rust-sdk/src/batch_scrape.rs new file mode 100644 index 00000000..1429fbea --- /dev/null +++ b/apps/rust-sdk/src/batch_scrape.rs @@ -0,0 +1,494 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{ + crawl::{CrawlErrorsResponse, CrawlStatus, CrawlStatusTypes}, + scrape::ScrapeOptions, + FirecrawlApp, FirecrawlError, API_VERSION, +}; + +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct BatchScrapeParams { + /// List of URLs to scrape + pub urls: Vec, + /// Scrape options to apply to all URLs + #[serde(flatten)] + pub options: Option, + /// Whether to ignore invalid URLs + #[serde(rename = "ignoreInvalidURLs")] + pub ignore_invalid_urls: bool, + /// ID of an existing job to append these URLs to + pub append_to_id: Option, + /// Webhook configuration + pub webhook: Option, + + /// Idempotency key to send to the crawl endpoint. + #[serde(skip)] + pub idempotency_key: Option, +} + +/// Options for webhook notifications +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct WebhookOptions { + /// URL to send webhook notifications to + pub url: String, + /// Custom headers to include in webhook requests + pub headers: Option>, + /// Authentication token for the webhook + pub auth_token: Option, +} + +impl From<&str> for WebhookOptions { + fn from(url: &str) -> Self { + Self { + url: url.to_string(), + headers: None, + auth_token: None, + } + } +} + +/// Response from initiating a batch scrape job +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct BatchScrapeResponse { + /// Whether the request was successful + pub success: bool, + /// The ID of the batch scrape job + pub id: String, + /// URL to get the status of the batch scrape job + pub url: String, + /// List of URLs that were invalid and could not be processed + pub invalid_urls: Option>, +} + +impl From for WebhookOptions { + fn from(url: String) -> Self { + Self { + url, + headers: None, + auth_token: None, + } + } +} + +impl FirecrawlApp { + /// Initiates an asynchronous batch scrape job + pub async fn async_batch_scrape_urls( + &self, + params: BatchScrapeParams, + ) -> Result { + let headers = self.prepare_headers(params.idempotency_key.as_ref()); + + let response = self + .client + .post(format!("{}{}/batch/scrape", self.api_url, API_VERSION)) + .headers(headers) + .json(¶ms) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Initiating batch scrape job".to_string(), e))?; + + self.handle_response(response, "initiate batch scrape job") + .await + } + + /// Initiates a batch scrape job and waits for completion + pub async fn batch_scrape_urls( + &self, + params: BatchScrapeParams, + poll_interval: Option, + ) -> Result { + let poll_interval_ms = poll_interval.unwrap_or(2000); + + let response = self.async_batch_scrape_urls(params).await?; + + self.monitor_batch_job_status(&response.id, poll_interval_ms) + .await + } + + /// Checks the status of a batch scrape job + pub async fn check_batch_scrape_status( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .get(format!( + "{}{}/batch/scrape/{}", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError( + format!("Checking status of batch scrape {}", id.as_ref()), + e, + ) + })?; + + let mut status: CrawlStatus = self + .handle_response( + response, + format!("Checking status of batch scrape {}", id.as_ref()), + ) + .await?; + + if status.status == CrawlStatusTypes::Completed { + while let Some(next) = status.next.clone() { + let new_status = self.check_batch_scrape_status_next(next).await?; + status.data.extend_from_slice(&new_status.data); + status.next = new_status.next; + } + } + + Ok(status) + } + + /// Helper function to paginate through batch scrape status results + async fn check_batch_scrape_status_next( + &self, + next: impl AsRef, + ) -> Result { + let response = self + .client + .get(next.as_ref()) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError( + format!("Paginating batch scrape using URL {:?}", next.as_ref()), + e, + ) + })?; + + self.handle_response( + response, + format!("Paginating batch scrape using URL {:?}", next.as_ref()), + ) + .await + } + + /// Check for errors in a batch scrape job + pub async fn check_batch_scrape_errors( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .get(format!( + "{}{}/batch/scrape/{}/errors", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError( + format!("Checking errors for batch scrape {}", id.as_ref()), + e, + ) + })?; + + self.handle_response( + response, + format!("Checking errors for batch scrape {}", id.as_ref()), + ) + .await + } + + /// Helper function to poll for batch job status until completion + async fn monitor_batch_job_status( + &self, + id: &str, + poll_interval: u64, + ) -> Result { + loop { + let status_data = self.check_batch_scrape_status(id).await?; + match status_data.status { + CrawlStatusTypes::Completed => { + break Ok(status_data); + } + CrawlStatusTypes::Scraping => { + tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await; + } + CrawlStatusTypes::Failed => { + break Err(FirecrawlError::CrawlJobFailed( + "Batch scrape job failed".into(), + status_data, + )); + } + CrawlStatusTypes::Cancelled => { + break Err(FirecrawlError::CrawlJobFailed( + "Batch scrape job was cancelled".into(), + status_data, + )); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_batch_scrape() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + + // Start a batch scrape job + let params = BatchScrapeParams { + urls: vec![ + "https://example.com".to_string(), + "https://example.org".to_string(), + ], + ignore_invalid_urls: true, + ..Default::default() + }; + + let response = app.async_batch_scrape_urls(params).await.unwrap(); + + assert!(response.success); + assert!(!response.id.is_empty()); + assert!(!response.url.is_empty()); + } + + #[tokio::test] + async fn test_async_batch_scrape_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock + let mock = server + .mock("POST", "/v1/batch/scrape") + // Remove the match_body expectation which might be causing issues + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "batch-123", + "url": "https://api.example.com/v1/batch/batch-123", + "invalidUrls": [] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = BatchScrapeParams { + urls: vec![ + "https://example.com".to_string(), + "https://example.org".to_string(), + ], + ignore_invalid_urls: true, + ..Default::default() + }; + + let response = app.async_batch_scrape_urls(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.id, "batch-123"); + assert_eq!(response.url, "https://api.example.com/v1/batch/batch-123"); + assert!(response.invalid_urls.unwrap_or_default().is_empty()); + mock.assert(); + } + + #[tokio::test] + async fn test_batch_scrape_with_webhook() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("POST", "/v1/batch/scrape") + // Remove the match_body expectation to simplify + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "batch-123", + "url": "https://api.example.com/v1/batch/batch-123" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = BatchScrapeParams { + urls: vec!["https://example.com".to_string()], + webhook: Some("https://webhook.example.com/notify".into()), + ..Default::default() + }; + + let response = app.async_batch_scrape_urls(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.id, "batch-123"); + mock.assert(); + } + + #[tokio::test] + async fn test_check_batch_scrape_status_with_mock() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("GET", "/v1/batch/scrape/batch-123") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "status": "completed", + "total": 2, + "completed": 2, + "creditsUsed": 2, + "expiresAt": "2023-12-31T23:59:59Z", + "data": [ + { + "metadata": { + "sourceURL": "https://example.com", + "statusCode": 200 + }, + "markdown": "Example Domain content" + }, + { + "metadata": { + "sourceURL": "https://example.org", + "statusCode": 200 + }, + "markdown": "Another example content" + } + ] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let status = app.check_batch_scrape_status("batch-123").await.unwrap(); + + assert_eq!(status.total, 2); + assert_eq!(status.completed, 2); + assert_eq!(status.data.len(), 2); + assert_eq!(status.data[0].metadata.source_url, "https://example.com"); + assert_eq!(status.data[1].metadata.source_url, "https://example.org"); + mock.assert(); + } + + #[tokio::test] + async fn test_check_batch_scrape_errors_with_mock() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("GET", "/v1/batch/scrape/batch-123/errors") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "errors": [ + { + "id": "error1", + "timestamp": "2023-01-01T00:00:00Z", + "url": "https://invalid.example.com", + "error": "Failed to load page" + } + ], + "robotsBlocked": [ + "https://example.com/admin" + ] + }) + .to_string(), + ) + .create_async() + .await; + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let errors = app.check_batch_scrape_errors("batch-123").await.unwrap(); + + assert_eq!(errors.errors.len(), 1); + assert_eq!(errors.errors[0].url, "https://invalid.example.com"); + assert_eq!(errors.robots_blocked.len(), 1); + assert_eq!(errors.robots_blocked[0], "https://example.com/admin"); + mock.assert(); + } + + #[tokio::test] + async fn test_batch_scrape_with_invalid_urls() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("POST", "/v1/batch/scrape") + // Remove the match_body expectation + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "batch-123", + "url": "https://api.example.com/v1/batch/batch-123", + "invalidUrls": ["invalid-url"] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = BatchScrapeParams { + urls: vec!["https://example.com".to_string(), "invalid-url".to_string()], + ignore_invalid_urls: true, + ..Default::default() + }; + + let response = app.async_batch_scrape_urls(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.id, "batch-123"); + assert_eq!(response.invalid_urls, Some(vec!["invalid-url".to_string()])); + mock.assert(); + } + + #[tokio::test] + async fn test_batch_scrape_error_response() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("POST", "/v1/batch/scrape") + .with_status(400) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "No valid URLs provided" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = BatchScrapeParams::default(); + let result = app.async_batch_scrape_urls(params).await; + + assert!(result.is_err()); + mock.assert(); + } +} diff --git a/apps/rust-sdk/src/crawl.rs b/apps/rust-sdk/src/crawl.rs index 2860d24a..a5f30f40 100644 --- a/apps/rust-sdk/src/crawl.rs +++ b/apps/rust-sdk/src/crawl.rs @@ -2,7 +2,11 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION}; +use crate::{ + document::Document, + scrape::{ScrapeFormats, ScrapeOptions}, + FirecrawlApp, FirecrawlError, API_VERSION, +}; #[derive(Deserialize, Serialize, Clone, Copy, Debug)] pub enum CrawlScrapeFormats { @@ -23,13 +27,13 @@ pub enum CrawlScrapeFormats { Links, /// Will result in a URL to a screenshot of the page. - /// + /// /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`. #[serde(rename = "screenshot")] Screenshot, /// Will result in a URL to a full-page screenshot of the page. - /// + /// /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`. #[serde(rename = "screenshot@fullPage")] ScreenshotFullPage, @@ -59,12 +63,12 @@ pub struct CrawlScrapeOptions { pub only_main_content: Option, /// HTML tags to exclusively include. - /// + /// /// For example, if you pass `div`, you will only get content from `
`s and their children. pub include_tags: Option>, /// HTML tags to exclude. - /// + /// /// For example, if you pass `img`, you will never get image URLs in your results. pub exclude_tags: Option>, @@ -81,7 +85,9 @@ pub struct CrawlScrapeOptions { impl From for ScrapeOptions { fn from(value: CrawlScrapeOptions) -> Self { ScrapeOptions { - formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()), + formats: value + .formats + .map(|formats| formats.into_iter().map(|x| x.into()).collect()), only_main_content: value.only_main_content, include_tags: value.include_tags, exclude_tags: value.exclude_tags, @@ -101,12 +107,12 @@ pub struct CrawlOptions { pub scrape_options: Option, /// URL RegEx patterns to (exclusively) include. - /// + /// /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled. pub include_paths: Option>, /// URL RegEx patterns to exclude. - /// + /// /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled. pub exclude_paths: Option>, @@ -200,6 +206,29 @@ pub struct CrawlStatus { pub data: Vec, } +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct CrawlError { + pub id: String, + pub timestamp: Option, + pub url: String, + pub error: String, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct CrawlErrorsResponse { + pub errors: Vec, + #[serde(rename = "robotsBlocked")] + pub robots_blocked: Vec, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct CancelCrawlResponse { + pub status: String, +} + #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub struct CrawlAsyncResponse { @@ -223,19 +252,20 @@ impl FirecrawlApp { url: url.as_ref().to_string(), options: options.unwrap_or_default(), }; - + let headers = self.prepare_headers(body.options.idempotency_key.as_ref()); let response = self .client - .post(&format!("{}{}/crawl", self.api_url, API_VERSION)) + .post(format!("{}{}/crawl", self.api_url, API_VERSION)) .headers(headers.clone()) .json(&body) .send() .await .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?; - self.handle_response::(response, "start crawl job").await + self.handle_response::(response, "start crawl job") + .await } /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`). @@ -245,38 +275,65 @@ impl FirecrawlApp { options: impl Into>, ) -> Result { let options = options.into(); - let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000); + let poll_interval = options + .as_ref() + .and_then(|x| x.poll_interval) + .unwrap_or(2000); let res = self.crawl_url_async(url, options).await?; self.monitor_job_status(&res.id, poll_interval).await } - async fn check_crawl_status_next(&self, next: impl AsRef) -> Result { + async fn check_crawl_status_next( + &self, + next: impl AsRef, + ) -> Result { let response = self .client .get(next.as_ref()) .headers(self.prepare_headers(None)) .send() .await - .map_err(|e| FirecrawlError::HttpError(format!("Paginating crawl using URL {:?}", next.as_ref()), e))?; + .map_err(|e| { + FirecrawlError::HttpError( + format!("Paginating crawl using URL {:?}", next.as_ref()), + e, + ) + })?; - self.handle_response(response, format!("Paginating crawl using URL {:?}", next.as_ref())).await + self.handle_response( + response, + format!("Paginating crawl using URL {:?}", next.as_ref()), + ) + .await } /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`. - pub async fn check_crawl_status(&self, id: impl AsRef) -> Result { + pub async fn check_crawl_status( + &self, + id: impl AsRef, + ) -> Result { let response = self .client - .get(&format!( + .get(format!( "{}{}/crawl/{}", - self.api_url, API_VERSION, id.as_ref() + self.api_url, + API_VERSION, + id.as_ref() )) .headers(self.prepare_headers(None)) .send() .await - .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?; + .map_err(|e| { + FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e) + })?; - let mut status: CrawlStatus = self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await?; + let mut status: CrawlStatus = self + .handle_response( + response, + format!("Checking status of crawl {}", id.as_ref()), + ) + .await?; if status.status == CrawlStatusTypes::Completed { while let Some(next) = status.next { @@ -304,16 +361,240 @@ impl FirecrawlApp { tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await; } CrawlStatusTypes::Failed => { - break Err(FirecrawlError::CrawlJobFailed(format!( - "Crawl job failed." - ), status_data)); + break Err(FirecrawlError::CrawlJobFailed( + "Crawl job failed".into(), + status_data, + )); } CrawlStatusTypes::Cancelled => { - break Err(FirecrawlError::CrawlJobFailed(format!( - "Crawl job was cancelled." - ), status_data)); + break Err(FirecrawlError::CrawlJobFailed( + "Crawl job was cancelled.".into(), + status_data, + )); } } } } + + /// Cancel an asynchronous crawl job using the Firecrawl API. + /// + /// # Returns + /// + /// A response indicating whether the cancellation was successful, or a FirecrawlError if the request fails. + pub async fn cancel_crawl( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .delete(format!( + "{}{}/crawl/{}", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError(format!("Cancelling crawl {}", id.as_ref()), e) + })?; + + self.handle_response(response, "crawl_cancel").await + } + + /// Returns information about crawl errors. + /// + /// # Returns + /// + /// A response containing information about crawl errors, or a FirecrawlError if the request fails. + pub async fn check_crawl_errors( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .get(format!( + "{}{}/crawl/{}/errors", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError(format!("Checking errors for crawl {}", id.as_ref()), e) + })?; + + self.handle_response(response, "crawl_check").await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_cancel_crawl() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + + // First start a crawl job + let crawl_response = app + .crawl_url_async("https://example.com", None) + .await + .unwrap(); + + // Then cancel it + let cancel_response = app.cancel_crawl(crawl_response.id).await.unwrap(); + + assert_eq!(cancel_response.status, "cancelled"); + } + + #[tokio::test] + async fn test_cancel_crawl_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the cancel request + let mock = server + .mock("DELETE", "/v1/crawl/test-crawl-id") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": null, + "status": "cancelled" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let response = app.cancel_crawl("test-crawl-id").await.unwrap(); + + assert_eq!(response.status, "cancelled"); + mock.assert(); + } + + #[tokio::test] + async fn test_cancel_crawl_error_response() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for an error response + let mock = server + .mock("DELETE", "/v1/crawl/invalid-id") + .with_status(404) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "Crawl job not found" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let result = app.cancel_crawl("invalid-id").await; + + assert!(result.is_err()); + mock.assert(); + } + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_check_crawl_errors() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + + // First start a crawl job + let crawl_response = app + .crawl_url_async("https://no-wer-agg.invalid", None) + .await + .unwrap(); + + // Check for errors + let errors_response = app.check_crawl_errors(crawl_response.id).await.unwrap(); + println!("{errors_response:?}"); + + tokio::time::sleep(tokio::time::Duration::from_secs(3)).await; + + assert!( + !errors_response.errors.is_empty(), + "WARN: Error returned related to Supabase not in my environment. It may fail" + ); + } + + #[tokio::test] + async fn test_check_crawl_errors_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the check errors request + let mock = server + .mock("GET", "/v1/crawl/test-crawl-id/errors") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "errors": [ + { + "id": "error1", + "timestamp": "2023-01-01T00:00:00Z", + "url": "https://example.com/error-page", + "error": "Failed to load page" + } + ], + "robotsBlocked": [ + "https://example.com/blocked-by-robots" + ] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let response = app.check_crawl_errors("test-crawl-id").await.unwrap(); + + assert_eq!(response.errors.len(), 1); + assert_eq!(response.errors[0].id, "error1"); + assert_eq!(response.errors[0].url, "https://example.com/error-page"); + assert_eq!(response.errors[0].error, "Failed to load page"); + assert_eq!(response.robots_blocked.len(), 1); + assert_eq!( + response.robots_blocked[0], + "https://example.com/blocked-by-robots" + ); + mock.assert(); + } + + #[tokio::test] + async fn test_check_crawl_errors_error_response() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for an error response + let mock = server + .mock("GET", "/v1/crawl/invalid-id/errors") + .with_status(404) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "Crawl job not found" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let result = app.check_crawl_errors("invalid-id").await; + + assert!(result.is_err()); + mock.assert(); + } } diff --git a/apps/rust-sdk/src/document.rs b/apps/rust-sdk/src/document.rs index 1948a4ce..38c7fcc1 100644 --- a/apps/rust-sdk/src/document.rs +++ b/apps/rust-sdk/src/document.rs @@ -57,12 +57,12 @@ pub struct Document { pub markdown: Option, /// The HTML of the page, present if `ScrapeFormats::HTML` is present in `ScrapeOptions.formats`. - /// + /// /// This contains HTML that has non-content tags removed. If you need the original HTML, use `ScrapeFormats::RawHTML`. pub html: Option, /// The raw HTML of the page, present if `ScrapeFormats::RawHTML` is present in `ScrapeOptions.formats`. - /// + /// /// This contains the original, untouched HTML on the page. If you only need human-readable content, use `ScrapeFormats::HTML`. pub raw_html: Option, @@ -83,4 +83,3 @@ pub struct Document { /// The warning message will contain any errors encountered during the extraction. pub warning: Option, } - diff --git a/apps/rust-sdk/src/error.rs b/apps/rust-sdk/src/error.rs index 33e4edc6..f452cd0e 100644 --- a/apps/rust-sdk/src/error.rs +++ b/apps/rust-sdk/src/error.rs @@ -42,4 +42,6 @@ pub enum FirecrawlError { APIError(String, FirecrawlAPIError), #[error("Crawl job failed: {0}")] CrawlJobFailed(String, CrawlStatus), + #[error("Missuse: {0}")] + Missuse(String), } diff --git a/apps/rust-sdk/src/extract.rs b/apps/rust-sdk/src/extract.rs new file mode 100644 index 00000000..a1dd2ef9 --- /dev/null +++ b/apps/rust-sdk/src/extract.rs @@ -0,0 +1,596 @@ +use std::collections::HashMap; + +use schemars::schema_for; +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::{FirecrawlApp, FirecrawlError, API_VERSION}; + +/// Parameters for extract requests +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ExtractParams { + /// URLs to extract information from + pub urls: Option>, + + /// Extraction prompt + pub prompt: Option, + + /// Schema for structured output + pub schema: Option, + + /// System prompt for the LLM + pub system_prompt: Option, + + /// Allow following external links + pub allow_external_links: Option, + + /// Enable web search for additional information + pub enable_web_search: Option, + + /// Show sources in the response + pub show_sources: Option, + + /// Origin information, defaults to "api-sdk" + pub origin: Option, + + /// Timeout in milliseconds, defaults to 60000 + pub timeout: Option, + + /// Whether to include URL trace information, defaults to false + pub url_trace: Option, + + /// Whether to ignore sitemap, defaults to false + pub ignore_sitemap: Option, + + /// Whether to include subdomains, defaults to true + pub include_subdomains: Option, + + /// Maximum number of URLs to process + pub limit: Option, + + /// Experimental: Stream steps information + #[serde(rename = "__experimental_streamSteps")] + pub experimental_stream_steps: Option, + + /// Experimental: Include LLM usage information + #[serde(rename = "__experimental_llmUsage")] + pub experimental_llm_usage: Option, + + /// Experimental: Show sources information + #[serde(rename = "__experimental_showSources")] + pub experimental_show_sources: Option, + + /// Experimental: Cache key + #[serde(rename = "__experimental_cacheKey")] + pub experimental_cache_key: Option, + + /// Experimental: Cache mode, defaults to "direct" + #[serde(rename = "__experimental_cacheMode")] + pub experimental_cache_mode: Option, +} + +/// Response from initiating an extract operation +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ExtractResponse { + /// Whether the request was successful + pub success: bool, + + /// The ID of the extract job + pub id: String, + + /// URL trace information if requested + pub url_trace: Option>, +} + +/// Information about URL processing during extraction +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct URLTrace { + /// The URL being processed + pub url: String, + + /// Status of processing this URL + pub status: String, + + /// Timing information for URL processing + pub timing: URLTraceTiming, + + /// Error message if processing failed + pub error: Option, + + /// Warning message if there were issues + pub warning: Option, + + /// Content statistics + pub content_stats: Option, + + /// Relevance score for this URL (0-1) + pub relevance_score: Option, + + /// Whether this URL was used in the final completion + pub used_in_completion: Option, + + /// Fields extracted from this URL + pub extracted_fields: Option>, +} + +/// Timing information for URL processing +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct URLTraceTiming { + /// When the URL was discovered + pub discovered_at: String, + + /// When scraping began for this URL + pub scraped_at: Option, + + /// When processing was completed for this URL + pub completed_at: Option, +} + +/// Statistics about processed content +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ContentStats { + /// Length of the raw content in characters + pub raw_content_length: u32, + + /// Length of the processed content in characters + pub processed_content_length: u32, + + /// Number of tokens used for this content + pub tokens_used: u32, +} + +/// Response for extract status check +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ExtractStatusResponse { + /// Whether the request was successful + pub success: bool, + + /// Status of the extract job: "pending", "processing", "completed", "failed" + pub status: String, + + /// Extracted data, present when status is "completed" + pub data: Option, + + /// Error message if the job failed + pub error: Option, + + /// URL trace information if requested + pub url_trace: Option>, + + /// Sources information if requested + pub sources: Option>>, +} + +impl FirecrawlApp { + /// Extracts information from URLs using the Firecrawl API. + /// + /// This is the synchronous version that polls until completion. + /// + /// Either `params.prompt` or `params.schema` must be provided. + pub async fn extract( + &self, + params: impl Into, + ) -> Result { + let mut params = params.into(); + // Validation: Either prompt or schema must be provided + if params.prompt.is_none() && params.schema.is_none() { + return Err(FirecrawlError::APIError( + "Extract validation".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: "Either prompt or schema must be provided".to_string(), + details: None, + }, + )); + } + + // Set default origin if not provided + if params.origin.is_none() { + params.origin = Some("api-sdk".to_string()); + } + + // Initiate the extract job asynchronously + let response = self.async_extract(params).await?; + + // Poll for the result + let poll_interval = 2000; // Default to 2 seconds + self.monitor_extract_job_status(&response.id, poll_interval) + .await + } + + pub async fn extract_with_schemars( + &self, + params: impl Into, + ) -> Result + where + T: schemars::JsonSchema, + { + let mut params = params.into(); + let schema = schema_for!(T); + let schema_json = serde_json::to_value(schema).map_err(|e| { + FirecrawlError::APIError( + "Schema serialization".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: e.to_string(), + details: None, + }, + ) + })?; + params.schema = Some(schema_json); + self.extract(params).await + } + + /// Initiates an asynchronous extract operation. + /// + /// # Arguments + /// + /// * `params` - Parameters for the extract request + /// + /// # Returns + /// + /// A response containing the extract job ID, or a FirecrawlError if the request fails. + /// + /// # Notes + /// + /// Either `params.urls` or `params.prompt` must be provided. + /// Either `params.prompt` or `params.schema` must be provided. + pub async fn async_extract( + &self, + params: impl Into, + ) -> Result { + let params = params.into(); + // Validation: Either URLs or prompt must be provided + if params.urls.is_none() && params.prompt.is_none() { + return Err(FirecrawlError::APIError( + "Extract validation".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: "Either URLs or prompt must be provided".to_string(), + details: None, + }, + )); + } + + // Validation: Either prompt or schema must be provided + if params.prompt.is_none() && params.schema.is_none() { + return Err(FirecrawlError::APIError( + "Extract validation".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: "Either prompt or schema must be provided".to_string(), + details: None, + }, + )); + } + + let headers = self.prepare_headers(None); + + let response = self + .client + .post(format!("{}{}/extract", self.api_url, API_VERSION)) + .headers(headers) + .json(¶ms) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Initiating extract job".to_string(), e))?; + + self.handle_response(response, "initiate extract job").await + } + + /// Checks the status of an extract job. + /// + /// # Arguments + /// + /// * `id` - The ID of the extract job + /// + /// # Returns + /// + /// A response containing the status of the extract job, or a FirecrawlError if the request fails. + pub async fn get_extract_status( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .get(format!( + "{}{}/extract/{}", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError(format!("Checking status of extract {}", id.as_ref()), e) + })?; + + self.handle_response( + response, + format!("Checking status of extract {}", id.as_ref()), + ) + .await + } + + /// Helper function to poll for extract job status until completion + async fn monitor_extract_job_status( + &self, + id: &str, + poll_interval: u64, + ) -> Result { + loop { + let status_data = self.get_extract_status(id).await?; + + match status_data.status.as_str() { + "completed" => { + break Ok(status_data); + } + "pending" | "processing" => { + tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await; + } + "failed" => { + let error_msg = status_data + .error + .clone() + .unwrap_or_else(|| "Extract job failed".to_string()); + break Err(FirecrawlError::APIError( + "Extract job failed".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: error_msg, + details: None, + }, + )); + } + _ => { + break Err(FirecrawlError::APIError( + "Extract job status".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: format!("Unexpected status: {}", status_data.status), + details: None, + }, + )); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_extract() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + + // Create extract params + let params = ExtractParams { + urls: Some(vec!["https://example.com".to_string()]), + prompt: Some("Extract the title and main content from this page".to_string()), + schema: None, + origin: Some("test".to_string()), + ..Default::default() + }; + + // Start an extract job + let response = app.async_extract(params).await.unwrap(); + + assert!(response.success); + assert!(!response.id.is_empty()); + } + + #[tokio::test] + async fn test_async_extract_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the extract request + let mock = server + .mock("POST", "/v1/extract") + .match_body(mockito::Matcher::PartialJson(json!({ + "urls": ["https://example.com"], + "prompt": "Extract the title and main content" + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "extract-123", + "urlTrace": [] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = ExtractParams { + urls: Some(vec!["https://example.com".to_string()]), + prompt: Some("Extract the title and main content".to_string()), + schema: None, + ..Default::default() + }; + + let response = app.async_extract(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.id, "extract-123"); + assert!(response.url_trace.unwrap_or_default().is_empty()); + mock.assert(); + } + + #[tokio::test] + async fn test_extract_with_schema() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the extract request with schema + let mock = server + .mock("POST", "/v1/extract") + .match_body(mockito::Matcher::PartialJson(json!({ + "urls": ["https://example.com"], + "schema": { + "type": "object", + "properties": { + "title": { "type": "string" }, + "content": { "type": "string" } + } + } + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "extract-123" + }) + .to_string(), + ) + .create(); + + // Set up the mock for the status request + let status_mock = server + .mock("GET", "/v1/extract/extract-123") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "status": "completed", + "data": { + "title": "Example Domain", + "content": "This domain is for use in illustrative examples in documents." + } + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let urls = Some(vec!["https://example.com".to_string()]); + let params = ExtractParams { + urls, + schema: Some(json!({ + "type": "object", + "properties": { + "title": { "type": "string" }, + "content": { "type": "string" } + } + })), + ..Default::default() + }; + + let response = app.extract(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.status, "completed"); + + let data = response.data.unwrap(); + assert_eq!(data["title"], "Example Domain"); + assert_eq!( + data["content"], + "This domain is for use in illustrative examples in documents." + ); + + mock.assert(); + status_mock.assert(); + } + + #[tokio::test] + async fn test_extract_status_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the status check + let mock = server + .mock("GET", "/v1/extract/extract-123") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "status": "processing", + "urlTrace": [ + { + "url": "https://example.com", + "status": "scraping", + "timing": { + "discoveredAt": "2023-01-01T00:00:00Z" + } + } + ] + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let status = app.get_extract_status("extract-123").await.unwrap(); + + assert!(status.success); + assert_eq!(status.status, "processing"); + assert_eq!(status.url_trace.unwrap()[0].url, "https://example.com"); + mock.assert(); + } + + #[tokio::test] + async fn test_extract_validation_errors() { + let app = FirecrawlApp::new_selfhosted("https://example.com", Some("test_key")).unwrap(); + + // Test missing both URLs and prompt + let result = app.async_extract(ExtractParams::default()).await; + assert!(result.is_err()); + + // Test having URLs but missing both prompt and schema + let params = ExtractParams { + urls: Some(vec!["https://example.com".to_string()]), + ..Default::default() + }; + let result = app.async_extract(params).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_extract_api_error() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for an error response + let mock = server + .mock("POST", "/v1/extract") + .with_status(400) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "Invalid schema format" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = ExtractParams { + urls: Some(vec!["https://example.com".to_string()]), + schema: Some(json!("invalid")), // Invalid schema format + ..Default::default() + }; + + let result = app.async_extract(params).await; + assert!(result.is_err()); + mock.assert(); + } +} diff --git a/apps/rust-sdk/src/lib.rs b/apps/rust-sdk/src/lib.rs index 5d95cc7d..80bffc46 100644 --- a/apps/rust-sdk/src/lib.rs +++ b/apps/rust-sdk/src/lib.rs @@ -2,14 +2,18 @@ use reqwest::{Client, Response}; use serde::de::DeserializeOwned; use serde_json::Value; +pub mod batch_scrape; pub mod crawl; pub mod document; mod error; +pub mod extract; +pub mod llmstxt; pub mod map; pub mod scrape; +pub mod search; -pub use error::FirecrawlError; use error::FirecrawlAPIError; +pub use error::FirecrawlError; #[derive(Clone, Debug)] pub struct FirecrawlApp { @@ -26,9 +30,12 @@ impl FirecrawlApp { FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key)) } - pub fn new_selfhosted(api_url: impl AsRef, api_key: Option>) -> Result { + pub fn new_selfhosted( + api_url: impl AsRef, + api_key: Option>, + ) -> Result { let url = api_url.as_ref().to_string(); - + if url == CLOUD_API_URL && api_key.is_none() { return Err(FirecrawlError::APIError( "Configuration".to_string(), @@ -36,7 +43,7 @@ impl FirecrawlApp { success: false, error: "API key is required for cloud service".to_string(), details: None, - } + }, )); } @@ -73,27 +80,43 @@ impl FirecrawlApp { .text() .await .map_err(|e| FirecrawlError::ResponseParseErrorText(e)) - .and_then(|response_json| serde_json::from_str::(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e))) + .and_then(|response_json| { + serde_json::from_str::(&response_json) + .map_err(|e| FirecrawlError::ResponseParseError(e)) + .inspect(|data| { + #[cfg(debug_assertions)] + println!("Response JSON: {:#?}", data); + }) + }) .and_then(|response_value| { - if response_value["success"].as_bool().unwrap_or(false) { - Ok(serde_json::from_value::(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?) + if action.as_ref().starts_with("crawl_") // no success in check/cancel crawl responses + || response_value["success"].as_bool().unwrap_or(false) + { + Ok(serde_json::from_value::(response_value) + .map_err(|e| FirecrawlError::ResponseParseError(e))?) } else { Err(FirecrawlError::APIError( action.as_ref().to_string(), - serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))? + serde_json::from_value(response_value) + .map_err(|e| FirecrawlError::ResponseParseError(e))?, )) } }); match &response { Ok(_) => response, - Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => { + Err(FirecrawlError::ResponseParseError(_)) + | Err(FirecrawlError::ResponseParseErrorText(_)) => { if is_success { response } else { - Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string())) + Err(FirecrawlError::HttpRequestFailed( + action.as_ref().to_string(), + status.as_u16(), + status.as_str().to_string(), + )) } - }, + } Err(_) => response, } } diff --git a/apps/rust-sdk/src/llmstxt.rs b/apps/rust-sdk/src/llmstxt.rs new file mode 100644 index 00000000..f8fda7e9 --- /dev/null +++ b/apps/rust-sdk/src/llmstxt.rs @@ -0,0 +1,426 @@ +use serde::{Deserialize, Serialize}; + +use crate::{FirecrawlApp, FirecrawlError, API_VERSION}; + +/// Parameters for generating LLMs.txt +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct GenerateLLMsTextParams { + /// URL for which to generate LLMs.txt + pub url: String, + + /// Maximum number of URLs to process. Default: 10 + pub max_urls: u32, + + /// Whether to show the full LLMs-full.txt in the response. Default: false + pub show_full_text: bool, + + /// Experimental streaming option + #[serde(rename = "__experimental_stream")] + pub experimental_stream: bool, +} + +impl Default for GenerateLLMsTextParams { + fn default() -> Self { + Self { + url: String::new(), + max_urls: 1, + show_full_text: false, + experimental_stream: false, + } + } +} + +/// Response from initiating a LLMs.txt generation job +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct GenerateLLMsTextResponse { + /// Whether the request was successful + pub success: bool, + + /// Job ID for the LLMs.txt generation + pub id: String, +} + +#[derive(Deserialize, Serialize, Debug, Clone, Default)] +pub struct LLMTextData { + #[serde(rename = "llmstxt")] + pub compact: Option, + #[serde(rename = "llmsfulltxt")] + pub full: Option, +} + +/// Response from checking the status of a LLMs.txt generation job +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct GenerateLLMsTextStatusResponse { + /// Whether the request was successful + pub success: bool, + + /// Status of the job: "pending", "processing", "completed", "failed" + pub status: String, + + /// Generated LLMs.txt data, present when status is "completed" + #[serde(default)] + pub data: LLMTextData, + + /// Error message if the job failed + pub error: Option, + + /// Expiration timestamp for the data + pub expires_at: String, +} + +impl FirecrawlApp { + /// Generates LLMs.txt for a given URL and polls until completion. + /// + /// # Arguments + /// + /// * `params` - Parameters for the LLMs.txt generation + /// + /// # Returns + /// + /// A response containing the generation results, or a FirecrawlError if the request fails. + pub async fn generate_llms_text( + &self, + params: impl Into, + ) -> Result { + // Initiate the LLMs.txt generation job asynchronously + let response = self.async_generate_llms_text(params).await?; + + // Poll for the result + let poll_interval = 2000; // Default to 2 seconds + self.monitor_llms_text_job_status(&response.id, poll_interval) + .await + } + + /// Initiates an asynchronous LLMs.txt generation operation. + /// + /// # Arguments + /// + /// * `params` - Parameters for the LLMs.txt generation + /// + /// # Returns + /// + /// A response containing the generation job ID, or a FirecrawlError if the request fails. + pub async fn async_generate_llms_text( + &self, + params: impl Into, + ) -> Result { + let params = params.into(); + + // Validation: URL must be provided + if params.url.is_empty() { + return Err(FirecrawlError::APIError( + "Generate LLMs.txt validation".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: "URL must be provided".to_string(), + details: None, + }, + )); + } + + let headers = self.prepare_headers(None); + + let response = self + .client + .post(format!("{}{}/llmstxt", self.api_url, API_VERSION)) + .headers(headers) + .json(¶ms) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError("Initiating LLMs.txt generation".to_string(), e) + })?; + + self.handle_response(response, "initiate LLMs.txt generation") + .await + } + + /// Checks the status of a LLMs.txt generation operation. + /// + /// # Arguments + /// + /// * `id` - The ID of the LLMs.txt generation operation + /// + /// # Returns + /// + /// A response containing the current status and results of the generation operation, + /// or a FirecrawlError if the request fails. + pub async fn check_generate_llms_text_status( + &self, + id: impl AsRef, + ) -> Result { + let response = self + .client + .get(format!( + "{}{}/llmstxt/{}", + self.api_url, + API_VERSION, + id.as_ref() + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError( + format!("Checking status of LLMs.txt generation {}", id.as_ref()), + e, + ) + })?; + + self.handle_response( + response, + format!("Checking status of LLMs.txt generation {}", id.as_ref()), + ) + .await + } + + /// Helper function to poll for LLMs.txt generation job status until completion + async fn monitor_llms_text_job_status( + &self, + id: &str, + poll_interval: u64, + ) -> Result { + loop { + let status_data = self.check_generate_llms_text_status(id).await?; + + match status_data.status.as_str() { + "completed" => { + break Ok(status_data); + } + "pending" | "processing" => { + tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await; + } + "failed" => { + let error_msg = status_data + .error + .clone() + .unwrap_or_else(|| "LLMs.txt generation failed".to_string()); + break Err(FirecrawlError::APIError( + "LLMs.txt generation failed".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: error_msg, + details: None, + }, + )); + } + _ => { + break Err(FirecrawlError::APIError( + "LLMs.txt generation status".to_string(), + crate::error::FirecrawlAPIError { + success: false, + error: format!("Unexpected status: {}", status_data.status), + details: None, + }, + )); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_generate_llms_text() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + + let params = GenerateLLMsTextParams { + url: "https://example.com".to_string(), + max_urls: 5, + show_full_text: true, + ..Default::default() + }; + + let response = app.async_generate_llms_text(params).await.unwrap(); + + assert!(response.success); + assert!(!response.id.is_empty()); + } + + #[tokio::test] + async fn test_async_generate_llms_text_with_mock() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("POST", "/v1/llmstxt") + .match_body(mockito::Matcher::PartialJson(json!({ + "url": "https://example.com", + "maxUrls": 5 + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "llmstxt-123" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = GenerateLLMsTextParams { + url: "https://example.com".to_string(), + max_urls: 5, + ..Default::default() + }; + + let response = app.async_generate_llms_text(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.id, "llmstxt-123"); + mock.assert(); + } + + #[tokio::test] + async fn test_check_generate_llms_text_status_with_mock() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("GET", "/v1/llmstxt/llmstxt-123") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "status": "processing", + "expiresAt": "2023-01-01T00:00:00Z" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let status = app + .check_generate_llms_text_status("llmstxt-123") + .await + .unwrap(); + + assert!(status.success); + assert_eq!(status.status, "processing"); + assert_eq!(status.expires_at, "2023-01-01T00:00:00Z"); + mock.assert(); + } + + #[tokio::test] + async fn test_generate_llms_text_with_mock() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for the generate request + let mock = server + .mock("POST", "/v1/llmstxt") + .match_body(mockito::Matcher::PartialJson(json!({ + "url": "https://example.com", + "showFullText": true + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "id": "llmstxt-123" + }) + .to_string(), + ) + .create(); + + // Set up the mock for the status request + let status_mock = server + .mock("GET", "/v1/llmstxt/llmstxt-123") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "status": "completed", + "data": { + "llmstxt": "Allow: /about\nDisallow: /admin\n", + "llmsfulltxt": "# LLMs.txt\n\nAllow: /about\nDisallow: /admin\n" + }, + "expiresAt": "2023-01-01T00:00:00Z" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = GenerateLLMsTextParams { + url: "https://example.com".to_string(), + show_full_text: true, + ..Default::default() + }; + + let response = app.generate_llms_text(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.status, "completed"); + + let data = response.data; + assert_eq!( + data.compact, + Some("Allow: /about\nDisallow: /admin\n".into()) + ); + assert_eq!( + data.full, + Some("# LLMs.txt\n\nAllow: /about\nDisallow: /admin\n".into()) + ); + + mock.assert(); + status_mock.assert(); + } + + #[tokio::test] + async fn test_generate_llms_text_validation_errors() { + let app = FirecrawlApp::new_selfhosted("https://example.com", Some("test_key")).unwrap(); + + // Test missing URL + let params = GenerateLLMsTextParams { + url: "".to_string(), + ..Default::default() + }; + let result = app.async_generate_llms_text(params).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_generate_llms_text_api_error() { + let mut server = mockito::Server::new_async().await; + + // Set up the mock for an error response + let mock = server + .mock("POST", "/v1/llmstxt") + .with_status(400) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "Invalid URL format" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + + let params = GenerateLLMsTextParams { + url: "not-a-valid-url".to_string(), + ..Default::default() + }; + + let result = app.async_generate_llms_text(params).await; + assert!(result.is_err()); + mock.assert(); + } +} diff --git a/apps/rust-sdk/src/map.rs b/apps/rust-sdk/src/map.rs index 7c3b3a43..44016064 100644 --- a/apps/rust-sdk/src/map.rs +++ b/apps/rust-sdk/src/map.rs @@ -16,7 +16,7 @@ pub struct MapOptions { pub include_subdomains: Option, /// Maximum number of links to return (default: `5000`) - pub exclude_tags: Option, + pub limit: Option, } #[derive(Deserialize, Serialize, Debug, Default)] @@ -59,7 +59,9 @@ impl FirecrawlApp { .await .map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?; - let response = self.handle_response::(response, "scrape URL").await?; + let response = self + .handle_response::(response, "scrape URL") + .await?; Ok(response.links) } diff --git a/apps/rust-sdk/src/scrape.rs b/apps/rust-sdk/src/scrape.rs index b879fdaf..6432b04a 100644 --- a/apps/rust-sdk/src/scrape.rs +++ b/apps/rust-sdk/src/scrape.rs @@ -24,26 +24,26 @@ pub enum ScrapeFormats { Links, /// Will result in a URL to a screenshot of the page. - /// + /// /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`. #[serde(rename = "screenshot")] Screenshot, /// Will result in a URL to a full-page screenshot of the page. - /// + /// /// Can not be used in conjunction with `ScrapeFormats::Screenshot`. #[serde(rename = "screenshot@fullPage")] ScreenshotFullPage, /// Will result in the results of an LLM extraction. - /// + /// /// See `ScrapeOptions.extract` for more options. #[serde(rename = "extract")] Extract, } #[serde_with::skip_serializing_none] -#[derive(Deserialize, Serialize, Debug, Default)] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] #[serde(rename_all = "camelCase")] pub struct ExtractOptions { /// Schema the output should adhere to, provided in JSON Schema format. @@ -56,7 +56,7 @@ pub struct ExtractOptions { } #[serde_with::skip_serializing_none] -#[derive(Deserialize, Serialize, Debug, Default)] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] #[serde(rename_all = "camelCase")] pub struct ScrapeOptions { /// Formats to extract from the page. (default: `[ Markdown ]`) @@ -66,12 +66,12 @@ pub struct ScrapeOptions { pub only_main_content: Option, /// HTML tags to exclusively include. - /// + /// /// For example, if you pass `div`, you will only get content from `
`s and their children. pub include_tags: Option>, /// HTML tags to exclude. - /// + /// /// For example, if you pass `img`, you will never get image URLs in your results. pub exclude_tags: Option>, @@ -131,7 +131,9 @@ impl FirecrawlApp { .await .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?; - let response = self.handle_response::(response, "scrape URL").await?; + let response = self + .handle_response::(response, "scrape URL") + .await?; Ok(response.data) } diff --git a/apps/rust-sdk/src/search.rs b/apps/rust-sdk/src/search.rs new file mode 100644 index 00000000..397bc8d1 --- /dev/null +++ b/apps/rust-sdk/src/search.rs @@ -0,0 +1,245 @@ +use crate::{scrape::ScrapeOptions, FirecrawlApp, FirecrawlError, API_VERSION}; +use serde::{Deserialize, Serialize}; + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct SearchParams { + /// The search query string + pub query: String, + /// Maximum number of results to return. Default: 5, Max: 20 + pub limit: Option, + /// Time-based search filter. + #[serde(skip_serializing_if = "Option::is_none")] + pub tbs: Option, + /// Query string to filter search results. Example: "site:example.com" + #[serde(skip_serializing_if = "Option::is_none")] + pub filter: Option, + /// Language code. Default: "en" + pub lang: Option, + /// Country code. Default: "us" + pub country: Option, + /// Geographic location string for local search results + #[serde(skip_serializing_if = "Option::is_none")] + pub location: Option, + /// Origin identifier. Default: "api" + pub origin: Option, + /// Timeout in milliseconds. Default: 60000 + pub timeout: Option, + /// Additional options for webpage scraping behavior + #[serde(skip_serializing_if = "Option::is_none")] + pub scrape_options: Option, +} + +impl Default for SearchParams { + fn default() -> Self { + Self { + query: String::new(), + limit: Some(5), + tbs: None, + filter: None, + lang: Some("en".to_string()), + country: Some("us".to_string()), + location: None, + origin: Some("api".to_string()), + timeout: Some(60000), + scrape_options: None, + } + } +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct SearchResponse { + pub success: bool, + pub data: Vec, + pub warning: Option, +} + +// TODO: Consider merging fields into document::Document (url, title, description) while preserving optionality +/// A document returned from a search or scrape request +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct SearchDocument { + /// Document URL + pub url: String, + /// Document title + pub title: String, + /// Document description + pub description: String, +} + +impl FirecrawlApp { + /// Search for content using the Firecrawl API. + /// + /// # Arguments + /// + /// * `query` - The search query string + /// * `params` - Optional parameters for the search request + /// + /// # Returns + /// + /// A SearchResponse containing the search results, or a FirecrawlError if the request fails. + pub async fn search( + &self, + query: impl AsRef, + params: impl Into>, + ) -> Result { + let mut search_params = params.into().unwrap_or_default(); + search_params.query = query.as_ref().to_string(); + + self.search_with_params(search_params).await + } + + /// Alternative method that takes SearchParams directly + /// + /// # Arguments + /// + /// * `params` - Search parameters including the query + /// + /// # Returns + /// + /// A SearchResponse containing the search results, or a FirecrawlError if the request fails. + pub async fn search_with_params( + &self, + params: SearchParams, + ) -> Result { + let headers = self.prepare_headers(None); + + let response = self + .client + .post(format!("{}{}/search", self.api_url, API_VERSION)) + .headers(headers) + .json(¶ms) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError(format!("Searching with query: {:?}", params.query), e) + })?; + + self.handle_response::(response, "search") + .await + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + #[ignore = "Makes real network request"] + async fn test_real_search() { + let api_url = std::env::var("FIRECRAWL_API_URL") + .expect("Please set the FIRECRAWL_API_URL environment variable"); + let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap(); + let response = app.search("test query", None).await.unwrap(); + assert!(response.success); + } + + #[tokio::test] + async fn test_search_with_mock() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("POST", "/v1/search") + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "data": [{ + "url": "https://example.com", + "title": "Example Domain", + "description": "...." + }], + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let response = app.search("test", None).await.unwrap(); + + assert!(response.success); + assert_eq!(response.data.len(), 1); + assert_eq!(response.data[0].url, "https://example.com"); + assert_eq!(response.data[0].title, "Example Domain".to_string()); + assert_eq!(response.data[0].description, "....".to_string()); + mock.assert(); + } + + #[tokio::test] + async fn test_search_with_params() { + let mut server = mockito::Server::new_async().await; + let mock = server + .mock("POST", "/v1/search") + .with_header("content-type", "application/json") + .match_body(mockito::Matcher::Json(json!({ + "query": "test", + "limit": 10, + "lang": "fr", + "country": "fr", + "origin": "api", + "timeout": 30000 + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": true, + "data": [], + "warning": "No results found" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let params = SearchParams { + query: "test".to_string(), + limit: Some(10), + lang: Some("fr".to_string()), + country: Some("fr".to_string()), + timeout: Some(30000), + ..Default::default() + }; + + let response = app.search_with_params(params).await.unwrap(); + + assert!(response.success); + assert_eq!(response.data.len(), 0); + assert_eq!(response.warning, Some("No results found".to_string())); + mock.assert(); + } + + #[tokio::test] + async fn test_search_error_response() { + let mut server = mockito::Server::new_async().await; + let mock = server + .mock("POST", "/v1/search") + .with_status(400) + .with_header("content-type", "application/json") + .with_body( + json!({ + "success": false, + "error": "Invalid query" + }) + .to_string(), + ) + .create(); + + let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap(); + let result = app.search("", None).await; + + assert!(result.is_err()); + mock.assert(); + } + + #[tokio::test] + async fn test_search_network_error() { + let app = FirecrawlApp::new_selfhosted("http://invalid-url", Some("test_key")).unwrap(); + let result = app.search("test", None).await; + assert!(result.is_err()); + } +} diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index 882a2941..071537f5 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -1,4 +1,3 @@ -use assert_matches::assert_matches; use dotenvy::dotenv; use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}; use firecrawl::{FirecrawlApp, FirecrawlError}; @@ -24,11 +23,8 @@ use std::env; async fn test_successful_response_with_valid_preview_token() { dotenv().ok(); let api_url = env::var("API_URL").unwrap(); - let app = FirecrawlApp::new_selfhosted( - api_url, - Some(env::var("PREVIEW_TOKEN").unwrap()), - ) - .unwrap(); + let app = + FirecrawlApp::new_selfhosted(api_url, Some(env::var("PREVIEW_TOKEN").unwrap())).unwrap(); let result = app .scrape_url("https://roastmywebsite.ai", None) .await @@ -58,7 +54,7 @@ async fn test_successful_response_with_valid_api_key_and_include_html() { let api_key = env::var("TEST_API_KEY").ok(); let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); let params = ScrapeOptions { - formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(), + formats: vec![ScrapeFormats::Markdown, ScrapeFormats::HTML].into(), ..Default::default() }; let result = app @@ -82,7 +78,8 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file() { .await .unwrap(); assert!(result.markdown.is_some()); - assert!(result.markdown + assert!(result + .markdown .unwrap() .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy")); } @@ -98,12 +95,12 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici .await .unwrap(); assert!(result.markdown.is_some()); - assert!(result.markdown + assert!(result + .markdown .unwrap() .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy")); } - // #[tokio::test] // async fn test_should_return_error_for_blocklisted_url() { // dotenv().ok(); @@ -159,14 +156,18 @@ async fn test_llm_extraction() { #[test] fn test_api_key_requirements() { dotenv().ok(); - + let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string()); let api_key = env::var("TEST_API_KEY").ok(); match (api_url.contains("api.firecrawl.dev"), api_key) { (false, _) => { let result = FirecrawlApp::new_selfhosted(&api_url, None::); - assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap()); + assert!( + result.is_ok(), + "Local setup failed: {:?}", + result.err().unwrap() + ); } (true, None) => { let result = FirecrawlApp::new_selfhosted(&api_url, None::); From ec3d679c5b158311907b2601f4a694f7fbde23af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Apr 2025 22:15:13 -0700 Subject: [PATCH 160/160] feat(rust-sdk): add agent options --- apps/rust-sdk/src/extract.rs | 11 +++++++++++ apps/rust-sdk/src/scrape.rs | 35 +++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/apps/rust-sdk/src/extract.rs b/apps/rust-sdk/src/extract.rs index a1dd2ef9..a8721ee0 100644 --- a/apps/rust-sdk/src/extract.rs +++ b/apps/rust-sdk/src/extract.rs @@ -6,6 +6,14 @@ use serde_json::Value; use crate::{FirecrawlApp, FirecrawlError, API_VERSION}; +/// Agent options for extract requests +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct AgentOptionsExtract { + /// Model to use for the agent + pub model: String, +} + /// Parameters for extract requests #[serde_with::skip_serializing_none] #[derive(Deserialize, Serialize, Debug, Default, Clone)] @@ -50,6 +58,9 @@ pub struct ExtractParams { /// Maximum number of URLs to process pub limit: Option, + /// Agent options + pub agent: Option, + /// Experimental: Stream steps information #[serde(rename = "__experimental_streamSteps")] pub experimental_stream_steps: Option, diff --git a/apps/rust-sdk/src/scrape.rs b/apps/rust-sdk/src/scrape.rs index 6432b04a..5e2496fb 100644 --- a/apps/rust-sdk/src/scrape.rs +++ b/apps/rust-sdk/src/scrape.rs @@ -37,22 +37,42 @@ pub enum ScrapeFormats { /// Will result in the results of an LLM extraction. /// - /// See `ScrapeOptions.extract` for more options. - #[serde(rename = "extract")] - Extract, + /// See `ScrapeOptions.json_options` for more options. + #[serde(rename = "json")] + Json, +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct AgentOptionsJson { + pub model: String, + pub prompt: Option, } #[serde_with::skip_serializing_none] #[derive(Deserialize, Serialize, Debug, Default, Clone)] #[serde(rename_all = "camelCase")] -pub struct ExtractOptions { +pub struct JsonOptions { /// Schema the output should adhere to, provided in JSON Schema format. pub schema: Option, + /// System prompt to send to the LLM agent along with the page content. pub system_prompt: Option, /// Extraction prompt to send to the LLM agent along with the page content. pub prompt: Option, + + /// Agent options for JSON extraction. + pub agent: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct AgentOptions { + pub model: String, + pub prompt: Option, + pub session_id: Option, + pub wait_before_closing_ms: Option, } #[serde_with::skip_serializing_none] @@ -84,8 +104,11 @@ pub struct ScrapeOptions { // Timeout before returning an error, in milliseconds. (default: `60000`) pub timeout: Option, - /// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`. - pub extract: Option, + /// JSON extraction options, to be used in conjunction with `ScrapeFormats::Json`. + pub json_options: Option, + + /// Agent options for smart scrape. + pub agent: Option, } #[derive(Deserialize, Serialize, Debug, Default)]