diff --git a/apps/api/requests.http b/apps/api/requests.http index a3997371..4c69d011 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "https://firecrawl.dev" + "url":"https://firecrawl.dev" } ### Check Crawl Status diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 2b697ca9..7495e789 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -84,6 +84,18 @@ describe("Scrape tests", () => { // expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//); // }, 30000); // }); + + describe("Compare format", () => { + it.concurrent("works", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "compare"], + }); + + expect(response.compare).toBeDefined(); + expect(response.compare?.previousScrapeAt).not.toBeNull(); + }); + }); describe("Location API (f-e dependant)", () => { it.concurrent("works without specifying an explicit location", async () => { diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 2eba651d..c8b186b0 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index ffb8ebba..9153ea79 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); const sc: StoredCrawl = { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 62d62b09..0bdd197b 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -66,6 +66,7 @@ export async function scrapeHelper( extractorOptions, timeout, crawlerOptions, + team_id, ); await addScrapeJob( @@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) { pageOptions, extractorOptions, timeout, + team_id, ); logJob({ diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index ac7d7f62..d8649a52 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -72,6 +72,7 @@ export async function searchHelper( undefined, 60000, crawlerOptions, + team_id, ); if (justSearch) { diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index d2c079bf..20fab47c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -82,7 +82,7 @@ export async function batchScrapeController( : { crawlerOptions: null, scrapeOptions: req.body, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 51d373ee..31e39502 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -81,7 +81,7 @@ export async function crawlController( originUrl: req.body.url, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), scrapeOptions, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index ebb0b324..49890d90 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -85,7 +85,7 @@ export async function getMapResults({ scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { teamId }, team_id: teamId, createdAt: Date.now(), plan: plan, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ec11e2cb..44214ee2 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -50,7 +50,7 @@ export async function scrapeController( mode: "single_urls", team_id: req.auth.team_id, scrapeOptions: req.body, - internalOptions: {}, + internalOptions: { teamId: req.auth.team_id }, plan: req.auth.plan!, origin: req.body.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 18ff9579..082cd8cd 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -83,7 +83,7 @@ async function scrapeSearchResult( mode: "single_urls" as Mode, team_id: options.teamId, scrapeOptions: options.scrapeOptions, - internalOptions: {}, + internalOptions: { teamId: options.teamId }, plan: options.plan || "free", origin: options.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 459e5e56..b610826d 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -20,7 +20,8 @@ export type Format = | "links" | "screenshot" | "screenshot@fullPage" - | "extract"; + | "extract" + | "compare"; export const url = z.preprocess( (x) => { @@ -165,6 +166,7 @@ const baseScrapeOptions = z "screenshot@fullPage", "extract", "json", + "compare", ]) .array() .optional() @@ -172,6 +174,10 @@ const baseScrapeOptions = z .refine( (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage", + ) + .refine( + (x) => !x.includes("compare") || x.includes("markdown"), + "The compare format requires the markdown format to be specified as well", ), headers: z.record(z.string(), z.string()).optional(), includeTags: z.string().array().optional(), @@ -546,6 +552,11 @@ export type Document = { value: unknown }[]; }; + compare?: { + previousScrapeAt: string | null; + changeStatus: "new" | "same" | "changed" | "removed"; + visibility: "visible" | "hidden"; + } metadata: { title?: string; description?: string; @@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { }; } -export function fromLegacyCrawlerOptions(x: any): { +export function fromLegacyCrawlerOptions(x: any, teamId: string): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions; } { @@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): { }), internalOptions: { v0CrawlOnlyUrls: x.returnOnlyUrls, + teamId, }, }; } @@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions( pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { return { scrapeOptions: scrapeOptions.parse({ @@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions( internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, + teamId, }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; @@ -906,13 +920,15 @@ export function fromLegacyCombo( extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( pageOptions, extractorOptions, timeout, + teamId, ); - const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); + const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId); return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; } diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b741e615..e261800f 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -154,20 +154,20 @@ export async function finishCrawlKickoff(id: string) { ); } -export async function finishCrawl(id: string) { +export async function finishCrawlPre(id: string) { if (await isCrawlFinished(id)) { - _logger.debug("Marking crawl as finished.", { + _logger.debug("Marking crawl as pre-finished.", { module: "crawl-redis", - method: "finishCrawl", + method: "finishCrawlPre", crawlId: id, }); - const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); - await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); + const set = await redisConnection.setnx("crawl:" + id + ":finished_pre", "yes"); + await redisConnection.expire("crawl:" + id + ":finished_pre", 24 * 60 * 60); return set === 1; } else { - _logger.debug("Crawl can not be finished yet, not marking as finished.", { + _logger.debug("Crawl can not be pre-finished yet, not marking as finished.", { module: "crawl-redis", - method: "finishCrawl", + method: "finishCrawlPre", crawlId: id, jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"), jobs: await redisConnection.scard("crawl:" + id + ":jobs"), @@ -177,6 +177,16 @@ export async function finishCrawl(id: string) { } } +export async function finishCrawl(id: string) { + _logger.debug("Marking crawl as finished.", { + module: "crawl-redis", + method: "finishCrawl", + crawlId: id, + }); + await redisConnection.set("crawl:" + id + ":finish", "yes"); + await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); +} + export async function getCrawlJobs(id: string): Promise { return await redisConnection.smembers("crawl:" + id + ":jobs"); } @@ -250,7 +260,7 @@ export function generateURLPermutations(url: string | URL): URL[] { return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare]; }); - return permutations; + return [...new Set(permutations.map(x => x.href))].map(x => new URL(x)); } export async function lockURL( diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 8cbc75fd..e9bd729a 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -44,6 +44,7 @@ export async function scrapeDocument( scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }), internalOptions: { useCache: true, + teamId: options.teamId, }, plan: options.plan, origin: options.origin, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c6751218..ba983e07 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -51,6 +51,7 @@ export async function startWebScraperPipeline({ priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), + urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false, }); } @@ -66,6 +67,7 @@ export async function runWebScraper({ priority, is_scrape = false, is_crawl = false, + urlInvisibleInCurrentCrawl = false, }: RunWebScraperParams): Promise { const logger = _logger.child({ method: "runWebScraper", @@ -97,6 +99,8 @@ export async function runWebScraper({ response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions, + urlInvisibleInCurrentCrawl, + teamId: internalOptions?.teamId ?? team_id, }); if (!response.success) { if (response.error instanceof Error) { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c2c60383..f945cd22 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -47,6 +47,7 @@ export async function getLinksFromSitemap( ], v0DisableJsDom: true, abort, + teamId: "sitemap", }, ); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index eaf5497a..e047d1cb 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -162,6 +162,8 @@ async function buildMetaObject( } export type InternalOptions = { + teamId: string; + priority?: number; // Passed along to fire-engine forceEngine?: Engine | Engine[]; atsv?: boolean; // anti-bot solver, beta @@ -173,6 +175,7 @@ export type InternalOptions = { isBackgroundIndex?: boolean; fromCache?: boolean; // Indicates if the document was retrieved from cache abort?: AbortSignal; + urlInvisibleInCurrentCrawl?: boolean; }; export type EngineResultsTracker = { @@ -383,7 +386,7 @@ export async function scrapeURL( id: string, url: string, options: ScrapeOptions, - internalOptions: InternalOptions = {}, + internalOptions: InternalOptions, ): Promise { const meta = await buildMetaObject(id, url, options, internalOptions); try { diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 8b783821..b545266f 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["markdown", "html"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ onlyMainContent: false, }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => { onlyMainContent: false, excludeTags: [".nav", "#footer", "strong"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot@fullPage"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => { async (i) => { const url = "https://www.scrapethissite.com/?i=" + i; const id = "test:concurrent:" + url; - const out = await scrapeURL(id, url, scrapeOptions.parse({})); + const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }); const replacer = (key: string, value: any) => { if (value instanceof Error) { diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts new file mode 100644 index 00000000..9628844d --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -0,0 +1,42 @@ +import { supabase_service } from "../../../services/supabase"; +import { Document } from "../../../controllers/v1/types"; +import { Meta } from "../index"; + +export async function deriveDiff(meta: Meta, document: Document): Promise { + if (meta.options.formats.includes("compare")) { + const res = await supabase_service + .rpc("diff_get_last_scrape_1", { + i_team_id: meta.internalOptions.teamId, + i_url: document.metadata.sourceURL ?? meta.url, + }); + + const data: { + o_docs: Document[], + o_date_added: string, + } | undefined | null = (res.data ?? [])[0] as any; + + if (data && data.o_docs.length > 0) { + const previousMarkdown = data.o_docs[0].markdown!; + const currentMarkdown = document.markdown!; + + const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); + + document.compare = { + previousScrapeAt: data.o_date_added, + changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed", + visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", + } + } else if (!res.error) { + document.compare = { + previousScrapeAt: null, + changeStatus: document.metadata.statusCode === 404 ? "removed" : "new", + visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", + } + } else { + meta.logger.error("Error fetching previous scrape", { error: res.error }); + document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : ""); + } + } + + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index ea149dba..114c59a8 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract"; import { uploadScreenshot } from "./uploadScreenshot"; import { removeBase64Images } from "./removeBase64Images"; import { saveToCache } from "./cache"; - +import { deriveDiff } from "./diff"; export type Transformer = ( meta: Meta, document: Document, @@ -148,6 +148,17 @@ export function coerceFieldsToFormats( ); } + if (!formats.has("compare") && document.compare !== undefined) { + meta.logger.warn( + "Removed compare from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", + ); + delete document.compare; + } else if (formats.has("compare") && document.compare === undefined) { + meta.logger.warn( + "Request had format compare, but there was no compare field in the result.", + ); + } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { delete document.actions; } @@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [ deriveMetadataFromRawHTML, uploadScreenshot, performLLMExtract, + deriveDiff, coerceFieldsToFormats, removeBase64Images, ]; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index b6cfc454..a7da10f7 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -28,6 +28,7 @@ import { addCrawlJobs, crawlToCrawler, finishCrawl, + finishCrawlPre, finishCrawlKickoff, generateURLPermutations, getCrawl, @@ -100,7 +101,77 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const runningJobs: Set = new Set(); async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { - if (await finishCrawl(job.data.crawl_id)) { + if (await finishCrawlPre(job.data.crawl_id)) { + if (job.data.crawlerOptions && !await redisConnection.exists("crawl:" + job.data.crawl_id + ":invisible_urls")) { + await redisConnection.set("crawl:" + job.data.crawl_id + ":invisible_urls", "done", "EX", 60 * 60 * 24); + + const sc = (await getCrawl(job.data.crawl_id))!; + + const visitedUrls = new Set(await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited_unique", + )); + + const lastUrls: string[] = ((await supabase_service.rpc("diff_get_last_crawl_urls", { + i_team_id: job.data.team_id, + i_url: sc.originUrl!, + })).data ?? []).map(x => x.url); + + const lastUrlsSet = new Set(lastUrls); + + const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)); + const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); + + console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount); + + if (univistedUrls.length !== 0 && addableJobCount > 0) { + const jobs = univistedUrls.slice(0, addableJobCount).map((url) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: { + ...job.data.crawlerOptions, + urlInvisibleInCurrentCrawl: true, + }, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + const lockedIds = await lockURLsIndividually( + job.data.crawl_id, + sc, + jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })), + ); + const lockedJobs = jobs.filter((x) => + lockedIds.find((y) => y.id === x.opts.jobId), + ); + await addCrawlJobs( + job.data.crawl_id, + lockedJobs.map((x) => x.opts.jobId), + ); + await addScrapeJobs(lockedJobs); + + return; + } + } + + await finishCrawl(job.data.crawl_id); + (async () => { const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index f051b183..adc7df37 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -59,6 +59,7 @@ export interface RunWebScraperParams { priority?: number; is_scrape?: boolean; is_crawl?: boolean; + urlInvisibleInCurrentCrawl?: boolean; } export type RunWebScraperResult = diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0e6dbf7c..772f2a3e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.21.0", + "version": "1.21.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 41d13da0..bec5288f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -69,6 +69,11 @@ export interface FirecrawlDocument; includeTags?: string[]; excludeTags?: string[];