Update queue-worker.ts
This commit is contained in:
@@ -48,6 +48,9 @@ import {
|
||||
} from "../lib/concurrency-limit";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||
import { indexPage } from "../lib/extract/index/pinecone";
|
||||
import { Document } from "../controllers/v1/types";
|
||||
|
||||
configDotenv();
|
||||
|
||||
class RacedRedirectError extends Error {
|
||||
@@ -209,7 +212,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
|
||||
const result = await processJob(job, token);
|
||||
if (result.success) {
|
||||
try {
|
||||
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
if (
|
||||
job.data.crawl_id &&
|
||||
process.env.USE_DB_AUTHENTICATION === "true"
|
||||
) {
|
||||
logger.debug(
|
||||
"Job succeeded -- has crawl associated, putting null in Redis",
|
||||
);
|
||||
@@ -411,7 +417,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? 0
|
||||
: await crawler.tryGetSitemap(async urls => {
|
||||
: await crawler.tryGetSitemap(async (urls) => {
|
||||
if (urls.length === 0) return;
|
||||
|
||||
logger.debug("Using sitemap chunk of length " + urls.length, {
|
||||
@@ -425,7 +431,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
||||
});
|
||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||
|
||||
const jobs = urls.map(url => {
|
||||
const jobs = urls.map((url) => {
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
@@ -512,13 +518,32 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
||||
);
|
||||
}
|
||||
|
||||
return { success: true }
|
||||
return { success: true };
|
||||
} catch (error) {
|
||||
logger.error("An error occurred!", { error })
|
||||
logger.error("An error occurred!", { error });
|
||||
return { success: false, error };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Best-effort background indexing of a scraped document.
 *
 * The document is handed to `indexPage` only when it has markdown content and
 * the job's team matches `process.env.BACKGROUND_INDEX_TEAM_ID`; otherwise the
 * function does nothing.
 *
 * Call sites in this file invoke this function without awaiting it, so it
 * must never reject: every failure (looking up the crawl or indexing the
 * page) is logged and swallowed instead of surfacing as an unhandled promise
 * rejection.
 *
 * @param job - The queue job that produced the document; `job.data.team_id`
 *   and `job.data.crawl_id` are read.
 * @param document - The scraped document to index.
 * @returns A promise that resolves once indexing has been kicked off (it does
 *   not wait for `indexPage` to finish).
 */
async function indexJob(job: Job & { id: string }, document: Document) {
  // Guard clause: skip documents without markdown and jobs from other teams.
  if (
    !document ||
    !document.markdown ||
    job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID
  ) {
    return;
  }

  try {
    // For crawl jobs the origin is the crawl's originUrl; for standalone
    // jobs it is the document's own source URL.
    // NOTE(review): if the crawl record is missing, originUrl is undefined
    // here (as in the original code) -- confirm indexPage tolerates that.
    const originUrl = job.data.crawl_id
      ? (await getCrawl(job.data.crawl_id))?.originUrl!
      : document.metadata.sourceURL!;

    // Deliberately not awaited: indexing runs in the background and its
    // failures are only logged.
    indexPage({
      document,
      originUrl,
      crawlId: job.data.crawl_id,
      teamId: job.data.team_id,
    }).catch((error) => {
      _logger.error("Error indexing page", { error });
    });
  } catch (error) {
    // Covers a rejected getCrawl() lookup (or a synchronous throw from
    // indexPage), which previously escaped as an unhandled rejection.
    _logger.error("Error indexing page", { error });
  }
}
|
||||
|
||||
async function processJob(job: Job & { id: string }, token: string) {
|
||||
const logger = _logger.child({
|
||||
module: "queue-worker",
|
||||
@@ -623,8 +648,12 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
normalizeURL(doc.metadata.sourceURL, sc)
|
||||
) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
|
||||
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
|
||||
if (
|
||||
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
|
||||
) {
|
||||
throw new Error(
|
||||
"Redirected target URL is not allowed by crawlOptions",
|
||||
); // TODO: make this its own error type that is ignored by error tracking
|
||||
}
|
||||
|
||||
if (isUrlBlocked(doc.metadata.url)) {
|
||||
@@ -675,6 +704,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
true,
|
||||
);
|
||||
|
||||
indexJob(job, doc);
|
||||
|
||||
logger.debug("Declaring job as done...");
|
||||
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
||||
|
||||
@@ -755,6 +786,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
}
|
||||
|
||||
await finishCrawlIfNeeded(job, sc);
|
||||
} else {
|
||||
indexJob(job, doc);
|
||||
}
|
||||
|
||||
logger.info(`🐂 Job done ${job.id}`);
|
||||
|
||||
Reference in New Issue
Block a user