import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { CrawlRequest, crawlRequestSchema, CrawlResponse, RequestWithAuth, toLegacyCrawlerOptions, } from "./types"; import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl, } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob } from "../../services/queue-jobs"; import { logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; import { scrapeOptions as scrapeOptionsSchema } from "./types"; export async function crawlController( req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response ) { req.body = crawlRequestSchema.parse(req.body); const id = uuidv4(); await logCrawl(id, req.auth.team_id); let { remainingCredits } = req.account!; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if(!useDbAuthentication){ remainingCredits = Infinity; } const crawlerOptions = { ...req.body, url: undefined, scrapeOptions: undefined, }; const scrapeOptions = req.body.scrapeOptions; // TODO: @rafa, is this right? copied from v0 if (Array.isArray(crawlerOptions.includePaths)) { for (const x of crawlerOptions.includePaths) { try { new RegExp(x); } catch (e) { return res.status(400).json({ success: false, error: e.message }); } } } if (Array.isArray(crawlerOptions.excludePaths)) { for (const x of crawlerOptions.excludePaths) { try { new RegExp(x); } catch (e) { return res.status(400).json({ success: false, error: e.message }); } } } crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); const sc: StoredCrawl = { originUrl: req.body.url, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), scrapeOptions, internalOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, }; const crawler = crawlToCrawler(id, sc); try { sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification); } catch (e) { logger.debug( `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( e )}` ); } await saveCrawl(id, sc); const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap(); if (sitemap !== null && sitemap.length > 0) { let jobPriority = 20; // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 if(sitemap.length > 1000){ // set base to 21 jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) } const jobs = sitemap.map((x) => { const url = x.url; const uuid = uuidv4(); return { name: uuid, data: { url, mode: "single_urls", team_id: req.auth.team_id, plan: req.auth.plan, crawlerOptions, scrapeOptions, origin: "api", crawl_id: id, sitemapped: true, webhook: req.body.webhook, v1: true, }, opts: { jobId: uuid, priority: 20, }, }; }) await lockURLs( id, sc, jobs.map((x) => x.data.url) ); await addCrawlJobs( id, jobs.map((x) => x.opts.jobId) ); await getScrapeQueue().addBulk(jobs); } else { await lockURL(id, sc, req.body.url); const jobId = uuidv4(); await addScrapeJob( { url: req.body.url, mode: "single_urls", team_id: req.auth.team_id, crawlerOptions, scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions), plan: req.auth.plan!, origin: "api", crawl_id: id, webhook: req.body.webhook, v1: true, }, { priority: 15, }, jobId, ); await addCrawlJob(id, jobId); } if(req.body.webhook) { await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started"); } const protocol = process.env.ENV === "local" ? req.protocol : "https"; return res.status(200).json({ success: true, id, url: `${protocol}://${req.get("host")}/v1/crawl/${id}`, }); }