Update queue-worker.ts
This commit is contained in:
@@ -48,6 +48,9 @@ import {
|
|||||||
} from "../lib/concurrency-limit";
|
} from "../lib/concurrency-limit";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||||
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||||
|
import { indexPage } from "../lib/extract/index/pinecone";
|
||||||
|
import { Document } from "../controllers/v1/types";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
class RacedRedirectError extends Error {
|
class RacedRedirectError extends Error {
|
||||||
@@ -209,7 +212,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
|
|||||||
const result = await processJob(job, token);
|
const result = await processJob(job, token);
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
try {
|
try {
|
||||||
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
|
if (
|
||||||
|
job.data.crawl_id &&
|
||||||
|
process.env.USE_DB_AUTHENTICATION === "true"
|
||||||
|
) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Job succeeded -- has crawl associated, putting null in Redis",
|
"Job succeeded -- has crawl associated, putting null in Redis",
|
||||||
);
|
);
|
||||||
@@ -411,7 +417,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
|||||||
|
|
||||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||||
? 0
|
? 0
|
||||||
: await crawler.tryGetSitemap(async urls => {
|
: await crawler.tryGetSitemap(async (urls) => {
|
||||||
if (urls.length === 0) return;
|
if (urls.length === 0) return;
|
||||||
|
|
||||||
logger.debug("Using sitemap chunk of length " + urls.length, {
|
logger.debug("Using sitemap chunk of length " + urls.length, {
|
||||||
@@ -425,7 +431,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
|||||||
});
|
});
|
||||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||||
|
|
||||||
const jobs = urls.map(url => {
|
const jobs = urls.map((url) => {
|
||||||
const uuid = uuidv4();
|
const uuid = uuidv4();
|
||||||
return {
|
return {
|
||||||
name: uuid,
|
name: uuid,
|
||||||
@@ -512,13 +518,32 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
return { success: true }
|
return { success: true };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error("An error occurred!", { error })
|
logger.error("An error occurred!", { error });
|
||||||
return { success: false, error };
|
return { success: false, error };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sends a scraped document to the background page index when the job belongs
 * to the team named by the `BACKGROUND_INDEX_TEAM_ID` environment variable.
 *
 * Does nothing unless `document` is truthy, has truthy `markdown`, and
 * `job.data.team_id` equals that environment variable.
 *
 * Callers invoke this without awaiting the result, so failures of both the
 * `getCrawl` lookup and `indexPage` itself are caught and logged here rather
 * than escaping as an unhandled promise rejection.
 *
 * @param job - Queue job; `job.data.team_id` and `job.data.crawl_id` are read.
 * @param document - Scraped document to index.
 * @returns A promise that resolves once indexing has been attempted; errors
 *   from `getCrawl` / `indexPage` are logged, not propagated.
 */
async function indexJob(job: Job & { id: string }, document: Document) {
  if (
    !document ||
    !document.markdown ||
    job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!
  ) {
    return;
  }

  try {
    // Jobs that belong to a crawl use the stored crawl's originUrl; standalone
    // jobs use the document's own source URL.
    const originUrl = job.data.crawl_id
      ? (await getCrawl(job.data.crawl_id))?.originUrl!
      : document.metadata.sourceURL!;

    await indexPage({
      document: document,
      originUrl,
      crawlId: job.data.crawl_id,
      teamId: job.data.team_id,
    });
  } catch (error) {
    // Indexing is best-effort: log and swallow so the fire-and-forget call
    // sites never produce an unhandled rejection.
    _logger.error("Error indexing page", { error });
  }
}
|
||||||
|
|
||||||
async function processJob(job: Job & { id: string }, token: string) {
|
async function processJob(job: Job & { id: string }, token: string) {
|
||||||
const logger = _logger.child({
|
const logger = _logger.child({
|
||||||
module: "queue-worker",
|
module: "queue-worker",
|
||||||
@@ -623,8 +648,12 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
normalizeURL(doc.metadata.sourceURL, sc)
|
normalizeURL(doc.metadata.sourceURL, sc)
|
||||||
) {
|
) {
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||||
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
|
if (
|
||||||
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
|
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
|
||||||
|
) {
|
||||||
|
throw new Error(
|
||||||
|
"Redirected target URL is not allowed by crawlOptions",
|
||||||
|
); // TODO: make this its own error type that is ignored by error tracking
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isUrlBlocked(doc.metadata.url)) {
|
if (isUrlBlocked(doc.metadata.url)) {
|
||||||
@@ -675,6 +704,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
true,
|
true,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
indexJob(job, doc);
|
||||||
|
|
||||||
logger.debug("Declaring job as done...");
|
logger.debug("Declaring job as done...");
|
||||||
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
||||||
|
|
||||||
@@ -755,6 +786,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
await finishCrawlIfNeeded(job, sc);
|
await finishCrawlIfNeeded(job, sc);
|
||||||
|
} else {
|
||||||
|
indexJob(job, doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`🐂 Job done ${job.id}`);
|
logger.info(`🐂 Job done ${job.id}`);
|
||||||
|
|||||||
Reference in New Issue
Block a user