apps/api/src/controllers/v1/crawl.ts

import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
  CrawlRequest,
  crawlRequestSchema,
  CrawlResponse,
  RequestWithAuth,
  toLegacyCrawlerOptions
} from "./types";
import {
  addCrawlJob,
  addCrawlJobs,
  crawlToCrawler,
  lockURL,
  lockURLs,
  saveCrawl,
  StoredCrawl
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
import { scrapeOptions as scrapeOptionsSchema } from "./types";

/**
 * Returns the message of the first `RegExp` compilation error found in
 * `patterns`, or `null` when every entry compiles. Anything that is not an
 * array is treated as "nothing to validate" and also yields `null`.
 */
function findInvalidPathPattern(patterns: unknown): string | null {
  if (!Array.isArray(patterns)) {
    return null;
  }
  for (const pattern of patterns) {
    try {
      new RegExp(pattern);
    } catch (e) {
      // Catch variables are `unknown` under strict settings; narrow before
      // reading `.message`.
      return e instanceof Error ? e.message : String(e);
    }
  }
  return null;
}

/**
 * Handles a v1 crawl request.
 *
 * Steps, in order:
 * 1. Parses the body with `crawlRequestSchema` and logs the crawl.
 * 2. Rejects the request with HTTP 400 if any `includePaths` / `excludePaths`
 *    entry is not a valid regular expression.
 * 3. Caps `limit` at the account's remaining credits (no cap when
 *    `USE_DB_AUTHENTICATION` is not `"true"`).
 * 4. Builds and saves the `StoredCrawl`, fetching robots.txt on a best-effort
 *    basis.
 * 5. Queues one scrape job per sitemap URL, or a single job for the origin URL
 *    when no sitemap is available or the sitemap is ignored.
 * 6. Calls the webhook with `"crawl.started"` when one is configured.
 *
 * @param req - Authenticated request carrying the crawl options in its body.
 * @param res - Response used to send either the 400 error or the crawl handle.
 * @returns The Express response: 200 with the crawl `id` and status `url`, or
 *   400 with the regex error message.
 */
export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
  res: Response<CrawlResponse>
) {
  const preNormalizedBody = req.body;
  req.body = crawlRequestSchema.parse(req.body);

  const id = uuidv4();
  const logger = _logger.child({
    crawlId: id,
    module: "api/v1",
    method: "crawlController",
    teamId: req.auth.team_id,
    plan: req.auth.plan
  });
  logger.debug("Crawl " + id + " starting", {
    request: req.body,
    originalRequest: preNormalizedBody,
    account: req.account
  });

  await logCrawl(id, req.auth.team_id);

  // Without DB authentication there is no credit accounting, so the account
  // is not consulted and the requested limit is left uncapped.
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
  const remainingCredits = useDbAuthentication
    ? req.account!.remainingCredits
    : Infinity;

  // Everything in the body except the URL and scrape options is a crawler
  // option.
  const crawlerOptions = {
    ...req.body,
    url: undefined,
    scrapeOptions: undefined
  };
  const scrapeOptions = req.body.scrapeOptions;

  // Path filters are compiled here only to validate them (carried over from
  // v0); includePaths is checked before excludePaths.
  const invalidPattern =
    findInvalidPathPattern(crawlerOptions.includePaths) ??
    findInvalidPathPattern(crawlerOptions.excludePaths);
  if (invalidPattern !== null) {
    return res.status(400).json({ success: false, error: invalidPattern });
  }

  const originalLimit = crawlerOptions.limit;
  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
  logger.debug("Determined limit: " + crawlerOptions.limit, {
    remainingCredits,
    bodyLimit: originalLimit,
    originalBodyLimit: preNormalizedBody.limit
  });

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
    scrapeOptions,
    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan
  };

  const crawler = crawlToCrawler(id, sc);

  // robots.txt is best-effort: a failure is logged and the crawl continues.
  try {
    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
  } catch (e) {
    logger.debug("Failed to get robots.txt (this is probably fine!)", {
      error: e
    });
  }

  await saveCrawl(id, sc);

  const sitemap = sc.crawlerOptions.ignoreSitemap
    ? null
    : await crawler.tryGetSitemap();

  if (sitemap !== null && sitemap.length > 0) {
    logger.debug("Using sitemap of length " + sitemap.length, {
      sitemapLength: sitemap.length
    });
    let jobPriority = 20;
    // If it is over 1000, we need to get the job priority,
    // otherwise we can use the default priority of 20
    if (sitemap.length > 1000) {
      // set base to 21
      jobPriority = await getJobPriority({
        plan: req.auth.plan,
        team_id: req.auth.team_id,
        basePriority: 21
      });
    }
    logger.debug("Using job priority " + jobPriority, { jobPriority });

    const jobs = sitemap.map((x) => {
      const url = x.url;
      const uuid = uuidv4();
      return {
        name: uuid,
        data: {
          url,
          mode: "single_urls",
          team_id: req.auth.team_id,
          plan: req.auth.plan,
          crawlerOptions,
          scrapeOptions,
          internalOptions: sc.internalOptions,
          origin: "api",
          crawl_id: id,
          sitemapped: true,
          webhook: req.body.webhook,
          v1: true
        },
        opts: {
          jobId: uuid,
          // Use the priority computed above; it was previously logged but
          // never applied to the jobs.
          priority: jobPriority
        }
      };
    });

    logger.debug("Locking URLs...");
    await lockURLs(
      id,
      sc,
      jobs.map((x) => x.data.url)
    );
    logger.debug("Adding scrape jobs to Redis...");
    await addCrawlJobs(
      id,
      jobs.map((x) => x.opts.jobId)
    );
    logger.debug("Adding scrape jobs to BullMQ...");
    await getScrapeQueue().addBulk(jobs);
  } else {
    logger.debug("Sitemap not found or ignored.", {
      ignoreSitemap: sc.crawlerOptions.ignoreSitemap
    });

    logger.debug("Locking URL...");
    await lockURL(id, sc, req.body.url);
    const jobId = uuidv4();
    // Log labels follow the sitemap branch: the queue-jobs call is the BullMQ
    // step, the crawl-redis call is the Redis step.
    logger.debug("Adding scrape job to BullMQ...", { jobId });
    await addScrapeJob(
      {
        url: req.body.url,
        mode: "single_urls",
        team_id: req.auth.team_id,
        crawlerOptions,
        scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
        internalOptions: sc.internalOptions,
        plan: req.auth.plan!,
        origin: "api",
        crawl_id: id,
        webhook: req.body.webhook,
        v1: true
      },
      {
        priority: 15
      },
      jobId
    );
    logger.debug("Adding scrape job to Redis...", { jobId });
    await addCrawlJob(id, jobId);
  }
  logger.debug("Done queueing jobs!");

  if (req.body.webhook) {
    logger.debug("Calling webhook with crawl.started...", {
      webhook: req.body.webhook
    });
    await callWebhook(
      req.auth.team_id,
      id,
      null,
      req.body.webhook,
      true,
      "crawl.started"
    );
  }

  // Outside local development the status URL is always advertised as https.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
  });
}
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`import { Response } from "express";`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00			`import { v4 as uuidv4 } from "uuid";`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`import {`
			`CrawlRequest,`
			`crawlRequestSchema,`
			`CrawlResponse,`
			`RequestWithAuth,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`toLegacyCrawlerOptions`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`} from "./types";`
			`import {`
			`addCrawlJob,`
			`addCrawlJobs,`
			`crawlToCrawler,`
			`lockURL,`
			`lockURLs,`
			`saveCrawl,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`StoredCrawl`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`} from "../../lib/crawl-redis";`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`import { logCrawl } from "../../services/logging/crawl_log";`
			`import { getScrapeQueue } from "../../services/queue-service";`
			`import { addScrapeJob } from "../../services/queue-jobs";`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`import { logger as _logger } from "../../lib/logger";`
Nick: 2024-08-28 12:46:59 -03:00			`import { getJobPriority } from "../../lib/job-priority";`
Nick: webhooks v1 working great 2024-09-01 13:44:36 -03:00			`import { callWebhook } from "../../services/webhook";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { scrapeOptions as scrapeOptionsSchema } from "./types";`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
Update crawl.ts 2024-08-20 14:19:20 -03:00			`export async function crawlController(`
			`req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,`
			`res: Response<CrawlResponse>`
			`) {`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`const preNormalizedBody = req.body;`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`req.body = crawlRequestSchema.parse(req.body);`
Update crawl.ts 2024-08-20 14:19:20 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`const id = uuidv4();`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`const logger = _logger.child({`
			`crawlId: id,`
			`module: "api/v1",`
			`method: "crawlController",`
			`teamId: req.auth.team_id,`
			`plan: req.auth.plan`
			`});`
			`logger.debug("Crawl " + id + " starting", {`
			`request: req.body,`
			`originalRequest: preNormalizedBody,`
			`account: req.account`
			`});`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`await logCrawl(id, req.auth.team_id);`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`let { remainingCredits } = req.account!;`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";`
			`if (!useDbAuthentication) {`
bugfix: self-host crawling doesnt respect limit 2024-10-09 22:52:49 +00:00			`remainingCredits = Infinity;`
			`}`
Nick: 2024-08-20 14:39:52 -03:00
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`const crawlerOptions = {`
			`...req.body,`
			`url: undefined,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`scrapeOptions: undefined`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`};`
			`const scrapeOptions = req.body.scrapeOptions;`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
Nick: @rafaelsideguide isarray for includes/excludes 2024-08-26 19:07:14 -03:00			`// TODO: @rafa, is this right? copied from v0`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`if (Array.isArray(crawlerOptions.includePaths)) {`
			`for (const x of crawlerOptions.includePaths) {`
Nick: @rafaelsideguide isarray for includes/excludes 2024-08-26 19:07:14 -03:00			`try {`
			`new RegExp(x);`
			`} catch (e) {`
			`return res.status(400).json({ success: false, error: e.message });`
			`}`
			`}`
			`}`

`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`if (Array.isArray(crawlerOptions.excludePaths)) {`
			`for (const x of crawlerOptions.excludePaths) {`
Nick: @rafaelsideguide isarray for includes/excludes 2024-08-26 19:07:14 -03:00			`try {`
			`new RegExp(x);`
			`} catch (e) {`
			`return res.status(400).json({ success: false, error: e.message });`
			`}`
			`}`
			`}`

feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`const originalLimit = crawlerOptions.limit;`
Nick: 2024-08-20 14:39:52 -03:00			`crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.debug("Determined limit: " + crawlerOptions.limit, {`
			`remainingCredits,`
			`bodyLimit: originalLimit,`
			`originalBodyLimit: preNormalizedBody.limit`
			`});`

add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`const sc: StoredCrawl = {`
			`originUrl: req.body.url,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),`
			`scrapeOptions,`
fix(crawl): disable smart wait 2024-12-10 21:12:31 +01:00			`internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`team_id: req.auth.team_id,`
			`createdAt: Date.now(),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`plan: req.auth.plan`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`};`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`const crawler = crawlToCrawler(id, sc);`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`try {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`} catch (e) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.debug("Failed to get robots.txt (this is probably fine!)", {`
			`error: e`
			`});`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`}`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`await saveCrawl(id, sc);`

Update crawl.ts 2024-08-20 14:19:20 -03:00			`const sitemap = sc.crawlerOptions.ignoreSitemap`
			`? null`
			`: await crawler.tryGetSitemap();`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00
Nick: 2024-08-28 12:46:59 -03:00			`if (sitemap !== null && sitemap.length > 0) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.debug("Using sitemap of length " + sitemap.length, {`
			`sitemapLength: sitemap.length`
			`});`
Nick: 2024-08-28 12:46:59 -03:00			`let jobPriority = 20;`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`// If it is over 1000, we need to get the job priority,`
			`// otherwise we can use the default priority of 20`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (sitemap.length > 1000) {`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`// set base to 21`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`jobPriority = await getJobPriority({`
			`plan: req.auth.plan,`
			`team_id: req.auth.team_id,`
			`basePriority: 21`
			`});`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`}`
			`logger.debug("Using job priority " + jobPriority, { jobPriority });`

Update crawl.ts 2024-08-20 14:19:20 -03:00			`const jobs = sitemap.map((x) => {`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`const url = x.url;`
			`const uuid = uuidv4();`
			`return {`
			`name: uuid,`
			`data: {`
			`url,`
			`mode: "single_urls",`
			`team_id: req.auth.team_id,`
feat(concurrency-limit): set limit based on plan 2024-09-28 00:19:46 +02:00			`plan: req.auth.plan,`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`crawlerOptions,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`scrapeOptions,`
fix(crawl): disable smart wait 2024-12-10 21:12:31 +01:00			`internalOptions: sc.internalOptions,`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`origin: "api",`
			`crawl_id: id,`
			`sitemapped: true,`
Update crawl.ts 2024-08-30 15:21:22 -03:00			`webhook: req.body.webhook,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`v1: true`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`},`
			`opts: {`
			`jobId: uuid,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`priority: 20`
			`}`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`};`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Locking URLs...");`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`await lockURLs(`
			`id,`
feat(crawl): add parameter to treat differing query parameters as different URLs (#892 ) 2024-11-11 21:36:22 +01:00			`sc,`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`jobs.map((x) => x.data.url)`
			`);`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Adding scrape jobs to Redis...");`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`await addCrawlJobs(`
			`id,`
			`jobs.map((x) => x.opts.jobId)`
			`);`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Adding scrape jobs to BullMQ...");`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`await getScrapeQueue().addBulk(jobs);`
			`} else {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.debug("Sitemap not found or ignored.", {`
			`ignoreSitemap: sc.crawlerOptions.ignoreSitemap`
			`});`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00
			`logger.debug("Locking URL...");`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`await lockURL(id, sc, req.body.url);`
concurrency limit fix PoC II. 2024-10-25 20:21:12 +02:00			`const jobId = uuidv4();`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Adding scrape job to Redis...", { jobId });`
concurrency limit fix PoC II. 2024-10-25 20:21:12 +02:00			`await addScrapeJob(`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`{`
			`url: req.body.url,`
			`mode: "single_urls",`
			`team_id: req.auth.team_id,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`crawlerOptions,`
			`scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),`
fix(crawl): disable smart wait 2024-12-10 21:12:31 +01:00			`internalOptions: sc.internalOptions,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`plan: req.auth.plan!,`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`origin: "api",`
			`crawl_id: id,`
			`webhook: req.body.webhook,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`v1: true`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`},`
			`{`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`priority: 15`
concurrency limit fix PoC II. 2024-10-25 20:21:12 +02:00			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`jobId`
Update crawl.ts 2024-08-20 14:19:20 -03:00			`);`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Adding scrape job to BullMQ...", { jobId });`
concurrency limit fix PoC II. 2024-10-25 20:21:12 +02:00			`await addCrawlJob(id, jobId);`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00			`}`
feat(app): add extra crawl logging (app-side only for now) 2024-12-05 20:50:36 +01:00			`logger.debug("Done queueing jobs!");`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (req.body.webhook) {`
			`logger.debug("Calling webhook with crawl.started...", {`
			`webhook: req.body.webhook`
			`});`
			`await callWebhook(`
			`req.auth.team_id,`
			`id,`
			`null,`
			`req.body.webhook,`
			`true,`
			`"crawl.started"`
			`);`
Nick: webhooks v1 working great 2024-09-01 13:44:36 -03:00			`}`

Update crawl.ts 2024-09-05 13:03:43 -03:00			`const protocol = process.env.ENV === "local" ? req.protocol : "https";`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`return res.status(200).json({`
			`success: true,`
			`id,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
add zod, create middleware, update openapi declaration, add crawl logic 2024-08-15 23:30:33 +02:00			`});`
wip: map, crawl, scrape mockups 2024-08-06 15:24:45 -03:00			`}`