Update queue-worker.ts

2024-12-30 21:43:59 -03:00
parent e6da214aeb
commit bd81b41d5f
1 changed file with 94 additions and 61 deletions
@@ -48,6 +48,9 @@ import {
 } from "../lib/concurrency-limit";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 configDotenv();
 class RacedRedirectError extends Error {
@@ -209,7 +212,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
      const result = await processJob(job, token);
      if (result.success) {
        try {
-          if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
+          if (
            job.data.crawl_id &&
            process.env.USE_DB_AUTHENTICATION === "true"
          ) {
            logger.debug(
              "Job succeeded -- has crawl associated, putting null in Redis",
            );
@@ -411,7 +417,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
    const sitemap = sc.crawlerOptions.ignoreSitemap
      ? 0
-        : await crawler.tryGetSitemap(async urls => {
+      : await crawler.tryGetSitemap(async (urls) => {
          if (urls.length === 0) return;
          logger.debug("Using sitemap chunk of length " + urls.length, {
@@ -425,7 +431,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
          });
          logger.debug("Using job priority " + jobPriority, { jobPriority });
-            const jobs = urls.map(url => {
+          const jobs = urls.map((url) => {
            const uuid = uuidv4();
            return {
              name: uuid,
@@ -512,13 +518,32 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
      );
    }
-    return { success: true }
+    return { success: true };
  } catch (error) {
-    logger.error("An error occurred!", { error })
+    logger.error("An error occurred!", { error });
    return { success: false, error };
  }
 }
 /**
  * Best-effort background indexing of a scraped document.
  *
  * The document is handed to `indexPage` only when it has markdown content and
  * the job belongs to the team named by the `BACKGROUND_INDEX_TEAM_ID`
  * environment variable. When that variable is unset, nothing is indexed.
  *
  * This function never rejects: any failure while resolving the crawl or while
  * indexing is logged and swallowed, so it is safe to call without awaiting.
  *
  * @param job - The queue job whose `data.team_id` / `data.crawl_id` are read.
  * @param document - The scraped document to index.
  */
 async function indexJob(job: Job & { id: string }, document: Document) {
  const backgroundIndexTeamId = process.env.BACKGROUND_INDEX_TEAM_ID;

  // An unset env var must disable indexing; comparing against `undefined`
  // would otherwise match jobs that carry no team id at all.
  if (!backgroundIndexTeamId || job.data.team_id !== backgroundIndexTeamId) {
    return;
  }

  if (!document || !document.markdown) {
    return;
  }

  try {
    // For crawl jobs the origin is the crawl's own origin URL; for standalone
    // scrapes it is the document's source URL. The crawl lookup lives inside
    // the try block so that a failing lookup is logged instead of surfacing
    // as an unhandled promise rejection.
    const originUrl = job.data.crawl_id
      ? (await getCrawl(job.data.crawl_id))?.originUrl!
      : document.metadata.sourceURL!;

    await indexPage({
      document: document,
      originUrl,
      crawlId: job.data.crawl_id,
      teamId: job.data.team_id,
    });
  } catch (error) {
    _logger.error("Error indexing page", { error });
  }
 }
 async function processJob(job: Job & { id: string }, token: string) {
  const logger = _logger.child({
    module: "queue-worker",
@@ -623,8 +648,12 @@ async function processJob(job: Job & { id: string }, token: string) {
          normalizeURL(doc.metadata.sourceURL, sc)
      ) {
        const crawler = crawlToCrawler(job.data.crawl_id, sc);
-        if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
+        if (
-          throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
+          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
        ) {
          throw new Error(
            "Redirected target URL is not allowed by crawlOptions",
          ); // TODO: make this its own error type that is ignored by error tracking
        }
        if (isUrlBlocked(doc.metadata.url)) {
@@ -675,6 +704,8 @@ async function processJob(job: Job & { id: string }, token: string) {
        true,
      );
      indexJob(job, doc);
      logger.debug("Declaring job as done...");
      await addCrawlJobDone(job.data.crawl_id, job.id, true);
@@ -755,6 +786,8 @@ async function processJob(job: Job & { id: string }, token: string) {
      }
      await finishCrawlIfNeeded(job, sc);
    } else {
      indexJob(job, doc);
    }
    logger.info(`🐂 Job done ${job.id}`);