2024-08-15 23:30:33 +02:00
|
|
|
import { Response } from "express";
|
2024-08-06 15:24:45 -03:00
|
|
|
import { v4 as uuidv4 } from "uuid";
|
2024-08-20 14:19:20 -03:00
|
|
|
import {
|
|
|
|
|
CrawlRequest,
|
|
|
|
|
crawlRequestSchema,
|
|
|
|
|
CrawlResponse,
|
|
|
|
|
RequestWithAuth,
|
2024-12-11 19:51:08 -03:00
|
|
|
toLegacyCrawlerOptions,
|
2024-08-20 14:19:20 -03:00
|
|
|
} from "./types";
|
2025-01-10 18:35:10 -03:00
|
|
|
import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
|
2024-08-15 23:30:33 +02:00
|
|
|
import { logCrawl } from "../../services/logging/crawl_log";
|
2025-01-07 19:15:23 +01:00
|
|
|
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
2024-12-05 20:50:36 +01:00
|
|
|
import { logger as _logger } from "../../lib/logger";
|
2024-08-06 15:24:45 -03:00
|
|
|
|
2024-08-20 14:19:20 -03:00
|
|
|
/**
 * Finds the first entry of `patterns` that `new RegExp` refuses to compile.
 *
 * @param patterns - Candidate list of regex source strings; anything that is
 *   not an array is treated as "nothing to validate".
 * @returns The error message for the first entry that fails to compile, or
 *   `undefined` when every entry compiles.
 */
function findInvalidPatternError(patterns: unknown): string | undefined {
  if (!Array.isArray(patterns)) {
    return undefined;
  }

  for (const pattern of patterns) {
    try {
      new RegExp(pattern);
    } catch (e) {
      // Narrow before reading `.message` so this also type-checks when catch
      // variables are `unknown` (strict mode).
      return e instanceof Error ? e.message : String(e);
    }
  }

  return undefined;
}

/**
 * Handles a v1 crawl request: normalizes the body, stores the crawl and
 * enqueues its kickoff job.
 *
 * Steps, in order:
 *  1. Parse `req.body` with `crawlRequestSchema` and write the result back to
 *     `req.body` (NOTE(review): presumably throws on an invalid body — confirm
 *     against the schema definition).
 *  2. Generate a crawl id, log the start and record the crawl via `logCrawl`.
 *  3. Reject with HTTP 400 if any `includePaths` / `excludePaths` entry is not
 *     a valid regular expression.
 *  4. Cap `limit` at the account's remaining credits (no cap when
 *     `USE_DB_AUTHENTICATION` is not `"true"`).
 *  5. Build the `StoredCrawl`, try to attach robots.txt, save it, and enqueue
 *     a `"kickoff"` job.
 *
 * @param req - Authenticated request carrying the crawl request body.
 * @param res - Response used to send either the 400 error or the 200 result.
 * @returns The result of `res.json(...)`: `{ success: false, error }` with
 *   status 400, or `{ success: true, id, url }` with status 200.
 */
export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
  res: Response<CrawlResponse>,
) {
  // Keep the raw body around so it can be logged next to the parsed one.
  const preNormalizedBody = req.body;
  req.body = crawlRequestSchema.parse(req.body);

  const id = uuidv4();
  const logger = _logger.child({
    crawlId: id,
    module: "api/v1",
    method: "crawlController",
    teamId: req.auth.team_id,
    plan: req.auth.plan,
  });
  logger.debug("Crawl " + id + " starting", {
    request: req.body,
    originalRequest: preNormalizedBody,
    account: req.account,
  });

  await logCrawl(id, req.auth.team_id);

  let { remainingCredits } = req.account!;
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
  if (!useDbAuthentication) {
    // Without DB authentication the credit balance is not enforced.
    remainingCredits = Infinity;
  }

  // Everything in the body except `url` and `scrapeOptions` is a crawler option.
  const crawlerOptions = {
    ...req.body,
    url: undefined,
    scrapeOptions: undefined,
  };
  const scrapeOptions = req.body.scrapeOptions;

  // TODO: @rafa, is this right? copied from v0
  // includePaths is checked before excludePaths; the first bad pattern wins.
  for (const patterns of [
    crawlerOptions.includePaths,
    crawlerOptions.excludePaths,
  ]) {
    const invalidPatternError = findInvalidPatternError(patterns);
    if (invalidPatternError !== undefined) {
      return res
        .status(400)
        .json({ success: false, error: invalidPatternError });
    }
  }

  const originalLimit = crawlerOptions.limit;
  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
  logger.debug("Determined limit: " + crawlerOptions.limit, {
    remainingCredits,
    bodyLimit: originalLimit,
    originalBodyLimit: preNormalizedBody.limit,
  });

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
    scrapeOptions,
    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
  };

  const crawler = crawlToCrawler(id, sc);

  // Best effort: a failed robots.txt fetch is logged and the crawl continues.
  try {
    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
  } catch (e) {
    logger.debug("Failed to get robots.txt (this is probably fine!)", {
      error: e,
    });
  }

  await saveCrawl(id, sc);

  // NOTE(review): the trailing arguments are presumably job options, the job
  // id and the priority — confirm against `_addScrapeJobToBullMQ`.
  await _addScrapeJobToBullMQ(
    {
      url: req.body.url,
      mode: "kickoff" as const,
      team_id: req.auth.team_id,
      plan: req.auth.plan,
      crawlerOptions,
      scrapeOptions: sc.scrapeOptions,
      internalOptions: sc.internalOptions,
      origin: req.body.origin,
      crawl_id: id,
      webhook: req.body.webhook,
      v1: true,
    },
    {},
    crypto.randomUUID(),
    10,
  );

  // Outside of local development the returned URL always uses https.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
  });
}
|