diff --git a/apps/api/package.json b/apps/api/package.json index 53324783..fc7cf224 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -84,7 +84,6 @@ "escape-html": "^1.0.3", "express-rate-limit": "^7.3.1", "express-ws": "^5.0.2", - "form-data": "^4.0.0", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.4.1", @@ -117,6 +116,7 @@ "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", + "undici": "^6.20.1", "unstructured-client": "^0.11.3", "uuid": "^10.0.0", "winston": "^3.14.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 9a1c9a22..42e05c4c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -110,9 +110,6 @@ importers: express-ws: specifier: ^5.0.2 version: 5.0.2(express@4.19.2) - form-data: - specifier: ^4.0.0 - version: 4.0.0 glob: specifier: ^10.4.2 version: 10.4.2 @@ -209,6 +206,9 @@ importers: typesense: specifier: ^1.5.4 version: 1.8.2(@babel/runtime@7.24.6) + undici: + specifier: ^6.20.1 + version: 6.20.1 unstructured-client: specifier: ^0.11.3 version: 0.11.3(zod@3.23.8) @@ -5007,6 +5007,10 @@ packages: undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + undici@6.20.1: + resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==} + engines: {node: '>=18.17'} + union@0.5.0: resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==} engines: {node: '>= 0.8.0'} @@ -11209,6 +11213,8 @@ snapshots: undici-types@5.26.5: {} + undici@6.20.1: {} + union@0.5.0: dependencies: qs: 6.12.2 diff --git a/apps/api/requests.http b/apps/api/requests.http index 809bae7b..4ce40b2c 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,5 +1,10 @@ -### Crawl Website -POST http://localhost:3002/v0/scrape HTTP/1.1 +# Pick your baseUrl here: +@baseUrl = http://localhost:3002 +# @baseUrl = https://api.firecrawl.dev + +### Scrape Website +# @name scrape +POST {{baseUrl}}/v1/scrape HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json @@ -7,17 +12,9 @@ content-type: application/json "url":"firecrawl.dev" } -### Check Job Status -GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - - -### Check Job Status -GET http://localhost:3002/v0/jobs/active HTTP/1.1 - - -### Scrape Website -POST http://localhost:3002/v0/crawl HTTP/1.1 +### Crawl Website +# @name crawl +POST {{baseUrl}}/v1/crawl HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json @@ -25,58 +22,28 @@ content-type: application/json "url": "firecrawl.dev" } -## "reoveTags": [], - # "mode": "crawl", - # "crawlerOptions": { - # "allowBackwardCrawling": false - # }, - # "pageOptions": { - # "onlyMainContent": false, - # "includeHtml": false, - # "parsePDF": true - # } +### Check Crawl Status +@crawlId = {{crawl.response.body.$.id}} +# @name crawlStatus +GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} - - - - - - -### Scrape Website -POST http://localhost:3002/v0/scrape HTTP/1.1 +### Batch Scrape Websites +# @name batchScrape +POST {{baseUrl}}/v1/batch/scrape HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url":"https://mendable.ai" + "urls": [ + "firecrawl.dev", + "mendable.ai" + ] } - - -### Check Job Status -GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - -### Get Job Result - -POST https://api.firecrawl.dev/v0/crawl HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "url":"https://mendable.ai" -} - -### Check Job Status -GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - -### Get Active Jobs Count -GET http://localhost:3002/serverHealthCheck -content-type: application/json - -### Notify Server Health Check -GET http://localhost:3002/serverHealthCheck/notify -content-type: application/json - +### Check Batch Scrape Status +@batchScrapeId = {{batchScrape.response.body.$.id}} +# @name batchScrapeStatus +GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} \ No newline at end of file diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 7b6e610a..9c799eeb 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -75,7 +75,7 @@ export async function crawlStatusController(req: Request, res: Response) { const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; - const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); + const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); if ( jobs.length > 0 && diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index cb7a3ccc..fa7627da 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -138,6 +138,8 @@ export async function crawlController(req: Request, res: Response) { const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + delete (scrapeOptions as any).timeout; + const sc: StoredCrawl = { originUrl: url, crawlerOptions, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index f9048fd6..96dea785 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -239,11 +239,22 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; +export const webhookSchema = z.preprocess(x => { + if (typeof x === "string") { + return { url: x }; + } else { + return x; + } +}, z.object({ + url: z.string().url(), + headers: z.record(z.string(), z.string()).default({}), +}).strict(strictMessage)) + export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), - webhook: z.string().url().optional(), + webhook: webhookSchema.optional(), limit: z.number().default(10000), }).strict(strictMessage); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index bd79a86d..2b255971 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler { +export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler { const crawler = new WebCrawler({ jobId: id, - initialUrl: initialUrl ?? sc.originUrl!, + initialUrl: sc.originUrl!, + baseUrl: newBase ? new URL(newBase).origin : undefined, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index e5a25f37..7b4a97d9 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -27,6 +27,7 @@ export class WebCrawler { constructor({ jobId, initialUrl, + baseUrl, includes, excludes, maxCrawledLinks = 10000, @@ -38,6 +39,7 @@ export class WebCrawler { }: { jobId: string; initialUrl: string; + baseUrl?: string; includes?: string[]; excludes?: string[]; maxCrawledLinks?: number; @@ -49,7 +51,7 @@ export class WebCrawler { }) { this.jobId = jobId; this.initialUrl = initialUrl; - this.baseUrl = new URL(initialUrl).origin; + this.baseUrl = baseUrl ?? new URL(initialUrl).origin; this.includes = Array.isArray(includes) ? includes : []; this.excludes = Array.isArray(excludes) ? excludes : []; this.limit = limit; diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index f8196ccd..9881fae7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -1,6 +1,6 @@ import { Meta } from "../.."; import { EngineScrapeResult } from ".."; -import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; +import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; export async function scrapeDOCX(meta: Meta): Promise { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index ea44b051..bdc916e0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -1,5 +1,4 @@ import { createReadStream, promises as fs } from "node:fs"; -import FormData from "form-data"; import { Meta } from "../.."; import { EngineScrapeResult } from ".."; import * as marked from "marked"; @@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); const uploadForm = new FormData(); - uploadForm.append("file", createReadStream(tempFilePath), { - filename: tempFilePath, - contentType: "application/pdf", // NOTE: request.headers["Content-Type"]? - }); + + // This is utterly stupid but it works! - mogery + uploadForm.append("file", { + [Symbol.toStringTag]: "Blob", + name: tempFilePath, + stream() { + return createReadStream(tempFilePath) as unknown as ReadableStream + }, + arrayBuffer() { + throw Error("Unimplemented in mock Blob: arrayBuffer") + }, + size: (await fs.stat(tempFilePath)).size, + text() { + throw Error("Unimplemented in mock Blob: text") + }, + slice(start, end, contentType) { + throw Error("Unimplemented in mock Blob: slice") + }, + type: "application/pdf", + } as Blob); const upload = await robustFetch({ url: "https://api.cloud.llamaindex.ai/api/parsing/upload", @@ -47,6 +62,8 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis schema: z.object({ markdown: z.string(), }), + tryCount: 32, + tryCooldown: 250, }); return { diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 8db8892b..736faba7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs"; import { EngineError } from "../../error"; import { Writable } from "stream"; import { v4 as uuid } from "uuid"; +import * as undici from "undici"; export async function fetchFileToBuffer(url: string): Promise<{ response: Response, @@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{ } export async function downloadFile(id: string, url: string): Promise<{ - response: Response + response: undici.Response tempFilePath: string }> { const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); const tempFileWrite = createWriteStream(tempFilePath); - const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying + // TODO: maybe we could use tlsclient for this? for proxying + // use undici to ignore SSL for now + const response = await undici.fetch(url, { + dispatcher: new undici.Agent({ + connect: { + rejectUnauthorized: false, + }, + }) + }); // This should never happen in the current state of JS (2024), but let's check anyways. if (response.body === null) { diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 03bbd80c..09a280b8 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -2,7 +2,6 @@ import { Logger } from "winston"; import { z, ZodError } from "zod"; import { v4 as uuid } from "uuid"; import * as Sentry from "@sentry/node"; -import FormData from "form-data"; export type RobustFetchParams> = { url: string; @@ -16,6 +15,7 @@ export type RobustFetchParams> = { ignoreFailure?: boolean; requestId?: string; tryCount?: number; + tryCooldown?: number; }; export async function robustFetch, Output = z.infer>({ @@ -29,8 +29,9 @@ export async function robustFetch, Output = z.infer ignoreFailure = false, requestId = uuid(), tryCount = 1, + tryCooldown, }: RobustFetchParams): Promise { - const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount }; + const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown }; let request: Response; try { @@ -38,14 +39,14 @@ export async function robustFetch, Output = z.infer method, headers: { ...(body instanceof FormData - ? body.getHeaders() + ? ({}) : body !== undefined ? ({ "Content-Type": "application/json", }) : {}), ...(headers !== undefined ? headers : {}), }, ...(body instanceof FormData ? ({ - body: body.getBuffer(), + body, }) : body !== undefined ? ({ body: JSON.stringify(body), }) : {}), @@ -87,6 +88,9 @@ export async function robustFetch, Output = z.infer if (request.status >= 300) { if (tryCount > 1) { logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId }); + if (tryCooldown !== undefined) { + await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown)); + } return await robustFetch({ ...params, requestId, diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 5aa95b30..e451e0c0 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -6,6 +6,7 @@ import { logger } from "../../../src/lib/logger"; import { sendSlackWebhook } from "../alerts/slack"; import { getNotificationString } from "./notification_string"; import { AuthCreditUsageChunk } from "../../controllers/v1/types"; +import { redlock } from "../redlock"; const emailTemplates: Record< NotificationType, @@ -88,6 +89,7 @@ export async function sendNotificationInternal( if (team_id === "preview") { return { success: true }; } + return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => { if (!bypassRecentChecks) { const fifteenDaysAgo = new Date(); @@ -171,5 +173,6 @@ export async function sendNotificationInternal( return { success: false }; } - return { success: true }; + return { success: true }; + }); } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 5a0b28db..33b2ca9a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined); + const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string), @@ -568,24 +568,24 @@ async function processJob(job: Job & { id: string }, token: string) { crawl_id: job.data.crawl_id, }); - await logJob({ - job_id: job.data.crawl_id, - success: false, - message: - typeof error === "string" - ? error - : error.message ?? - "Something went wrong... Contact help@mendable.ai", - num_docs: 0, - docs: [], - time_taken: 0, - team_id: job.data.team_id, - mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", - url: sc ? sc.originUrl ?? job.data.url : job.data.url, - crawlerOptions: sc ? sc.crawlerOptions : undefined, - scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions, - origin: job.data.origin, - }); + // await logJob({ + // job_id: job.data.crawl_id, + // success: false, + // message: + // typeof error === "string" + // ? error + // : error.message ?? + // "Something went wrong... Contact help@mendable.ai", + // num_docs: 0, + // docs: [], + // time_taken: 0, + // team_id: job.data.team_id, + // mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", + // url: sc ? sc.originUrl ?? job.data.url : job.data.url, + // crawlerOptions: sc ? sc.crawlerOptions : undefined, + // scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions, + // origin: job.data.origin, + // }); } // done(null, data); return data; diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 620b6832..1cc4db84 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,15 +1,17 @@ import axios from "axios"; -import { logger } from "../../src/lib/logger"; +import { logger } from "../lib/logger"; import { supabase_service } from "./supabase"; import { WebhookEventType } from "../types"; import { configDotenv } from "dotenv"; +import { z } from "zod"; +import { webhookSchema } from "../controllers/v1/types"; configDotenv(); export const callWebhook = async ( teamId: string, id: string, data: any | null, - specified?: string, + specified?: z.infer, v1 = false, eventType: WebhookEventType = "crawl.page", awaitWebhook: boolean = false @@ -20,7 +22,7 @@ export const callWebhook = async ( id ); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; - let webhookUrl = specified ?? selfHostedUrl; + let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined); // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // and the USE_DB_AUTHENTICATION environment variable is set to true @@ -73,7 +75,7 @@ export const callWebhook = async ( if (awaitWebhook) { try { await axios.post( - webhookUrl, + webhookUrl.url, { success: !v1 ? data.success @@ -92,6 +94,7 @@ export const callWebhook = async ( { headers: { "Content-Type": "application/json", + ...webhookUrl.headers, }, timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) } @@ -104,7 +107,7 @@ export const callWebhook = async ( } else { axios .post( - webhookUrl, + webhookUrl.url, { success: !v1 ? data.success @@ -123,6 +126,7 @@ export const callWebhook = async ( { headers: { "Content-Type": "application/json", + ...webhookUrl.headers, }, timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2da97bd1..d7821407 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,4 +1,5 @@ -import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types"; +import { z } from "zod"; +import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types"; import { ExtractorOptions, Document } from "./lib/entities"; import { InternalOptions } from "./scraper/scrapeURL"; @@ -33,7 +34,7 @@ export interface WebScraperOptions { origin?: string; crawl_id?: string; sitemapped?: boolean; - webhook?: string; + webhook?: z.infer; v1?: boolean; is_scrape?: boolean; } diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f7891b9e..5d0a7fc9 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.8.1", + "version": "1.8.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 401b1c20..45e19197 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -153,7 +153,10 @@ export interface CrawlParams { allowExternalLinks?: boolean; ignoreSitemap?: boolean; scrapeOptions?: CrawlScrapeOptions; - webhook?: string; + webhook?: string | { + url: string; + headers?: Record; + }; deduplicateSimilarURLs?: boolean; ignoreQueryParameters?: boolean; }