Files
firecrawl/apps/api/src/controllers/v1/crawl.ts
T

126 lines
3.4 KiB
TypeScript
Raw Normal View History

import { Response } from "express";
2024-08-06 15:24:45 -03:00
import { v4 as uuidv4 } from "uuid";
2024-08-20 14:19:20 -03:00
import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
RequestWithAuth,
2024-12-11 19:51:08 -03:00
toLegacyCrawlerOptions,
2024-08-20 14:19:20 -03:00
} from "./types";
2025-01-10 18:35:10 -03:00
import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
2025-01-07 19:15:23 +01:00
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
2024-08-06 15:24:45 -03:00
2024-08-20 14:19:20 -03:00
/**
 * Returns the error message for the first entry of `patterns` that cannot be
 * compiled as a regular expression, or `undefined` when every entry compiles
 * (or when `patterns` is not an array at all).
 */
function findInvalidPatternError(patterns: unknown): string | undefined {
  if (!Array.isArray(patterns)) {
    return undefined;
  }
  for (const pattern of patterns) {
    try {
      new RegExp(pattern);
    } catch (e: unknown) {
      // Narrow before reading `.message`; fall back to a string form so the
      // response never carries `error: undefined`.
      return e instanceof Error ? e.message : String(e);
    }
  }
  return undefined;
}

/**
 * Handles a v1 crawl request: normalizes the body, stores the crawl, enqueues
 * the kickoff job and responds with the crawl id and its status URL.
 *
 * Steps, in order:
 *  1. Parse `req.body` with `crawlRequestSchema` (exceptions from `parse` are
 *     not caught here and propagate to the caller).
 *  2. Record the crawl via `logCrawl`.
 *  3. Reject with HTTP 400 if any `includePaths` / `excludePaths` entry is not
 *     a valid regular expression.
 *  4. Clamp `limit` to the team's remaining credits (no clamp when
 *     `USE_DB_AUTHENTICATION` is not `"true"`).
 *  5. Build the `StoredCrawl`, try to attach robots.txt (best effort), save it.
 *  6. Enqueue a `"kickoff"` scrape job and answer HTTP 200.
 *
 * @param req - Authenticated request carrying the crawl parameters in `body`.
 * @param res - Response used to send either the 400 error or the 200 result.
 * @returns The Express response that was sent.
 */
export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
  res: Response<CrawlResponse>,
) {
  // Keep the raw body around so the debug logs can show what the client sent
  // before schema normalization.
  const preNormalizedBody = req.body;
  req.body = crawlRequestSchema.parse(req.body);

  const id = uuidv4();
  const logger = _logger.child({
    crawlId: id,
    module: "api/v1",
    method: "crawlController",
    teamId: req.auth.team_id,
  });

  logger.debug("Crawl " + id + " starting", {
    request: req.body,
    originalRequest: preNormalizedBody,
    account: req.account,
  });

  await logCrawl(id, req.auth.team_id);

  // Credits only constrain the crawl when DB authentication is enabled, so the
  // account is only read in that case.
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
  const remainingCredits = useDbAuthentication
    ? req.account!.remainingCredits
    : Infinity;

  // Everything in the body except `url` and `scrapeOptions` is treated as a
  // crawler option; those two keys are kept but blanked out.
  const crawlerOptions = {
    ...req.body,
    url: undefined,
    scrapeOptions: undefined,
  };
  const scrapeOptions = req.body.scrapeOptions;

  // TODO: @rafa, is this right? copied from v0
  // includePaths is validated before excludePaths; the first bad pattern wins.
  for (const patterns of [
    crawlerOptions.includePaths,
    crawlerOptions.excludePaths,
  ]) {
    const patternError = findInvalidPatternError(patterns);
    if (patternError !== undefined) {
      return res.status(400).json({ success: false, error: patternError });
    }
  }

  const originalLimit = crawlerOptions.limit;
  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
  logger.debug("Determined limit: " + crawlerOptions.limit, {
    remainingCredits,
    bodyLimit: originalLimit,
    originalBodyLimit: preNormalizedBody.limit,
  });

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
    scrapeOptions,
    internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
    team_id: req.auth.team_id,
    createdAt: Date.now(),
  };

  const crawler = crawlToCrawler(id, sc);

  // robots.txt is best effort: a failure is logged and the crawl proceeds
  // without `sc.robots`.
  try {
    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
  } catch (e: unknown) {
    logger.debug("Failed to get robots.txt (this is probably fine!)", {
      error: e,
    });
  }

  await saveCrawl(id, sc);

  await _addScrapeJobToBullMQ(
    {
      url: req.body.url,
      mode: "kickoff" as const,
      team_id: req.auth.team_id,
      crawlerOptions,
      scrapeOptions: sc.scrapeOptions,
      internalOptions: sc.internalOptions,
      origin: req.body.origin,
      crawl_id: id,
      webhook: req.body.webhook,
      v1: true,
    },
    {},
    // Fresh UUID distinct from the crawl id (presumably the job id — confirm
    // against _addScrapeJobToBullMQ). Generated with the already-imported
    // uuidv4 instead of a global `crypto` binding this file does not import.
    uuidv4(),
    10, // NOTE(review): presumably the job priority — confirm against _addScrapeJobToBullMQ
  );

  // Outside of local development the status URL is always advertised as https.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
  });
}