feat(queue-worker): add more logs around crawl finishing logic

This commit is contained in:
Gergő Móricz
2025-05-09 16:52:38 +02:00
parent 907cf1cf41
commit fdeb01847d
+29 -8
View File
@@ -109,7 +109,16 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const runningJobs: Set<string> = new Set(); const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const logger = _logger.child({
module: "queue-worker",
method: "finishCrawlIfNeeded",
jobId: job.id,
scrapeId: job.id,
crawlId: job.data.crawl_id,
});
if (await finishCrawlPre(job.data.crawl_id)) { if (await finishCrawlPre(job.data.crawl_id)) {
logger.info("Crawl is pre-finished, checking if we need to add more jobs");
if ( if (
job.data.crawlerOptions && job.data.crawlerOptions &&
!(await redisConnection.exists( !(await redisConnection.exists(
@@ -131,6 +140,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
), ),
); );
logger.info("Visited URLs", {
visitedUrls: visitedUrls.size,
});
const lastUrls: string[] = ( const lastUrls: string[] = (
( (
await supabase_service.rpc("diff_get_last_crawl_urls", { await supabase_service.rpc("diff_get_last_crawl_urls", {
@@ -142,6 +155,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const lastUrlsSet = new Set(lastUrls); const lastUrlsSet = new Set(lastUrls);
logger.info("Last URLs", {
lastUrls: lastUrlsSet.size,
});
const crawler = crawlToCrawler( const crawler = crawlToCrawler(
job.data.crawl_id, job.data.crawl_id,
sc, sc,
@@ -162,15 +179,12 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
: sc.crawlerOptions.limit - : sc.crawlerOptions.limit -
(await getDoneJobsOrderedLength(job.data.crawl_id)); (await getDoneJobsOrderedLength(job.data.crawl_id));
console.log(
sc.originUrl!,
univistedUrls,
visitedUrls,
lastUrls,
addableJobCount,
);
if (univistedUrls.length !== 0 && addableJobCount > 0) { if (univistedUrls.length !== 0 && addableJobCount > 0) {
logger.info("Adding jobs", {
univistedUrls: univistedUrls.length,
addableJobCount,
});
const jobs = univistedUrls.slice(0, addableJobCount).map((url) => { const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
const uuid = uuidv4(); const uuid = uuidv4();
return { return {
@@ -212,10 +226,15 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
); );
await addScrapeJobs(lockedJobs); await addScrapeJobs(lockedJobs);
logger.info("Added jobs, not going for the full finish", {
lockedJobs: lockedJobs.length,
});
return; return;
} }
} }
logger.info("Finishing crawl");
await finishCrawl(job.data.crawl_id); await finishCrawl(job.data.crawl_id);
(async () => { (async () => {
@@ -267,6 +286,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
) )
.filter((x) => x !== null); .filter((x) => x !== null);
logger.info("Logging crawl NOW!");
await logJob({ await logJob({
job_id: job.data.crawl_id, job_id: job.data.crawl_id,
success: jobStatus === "completed", success: jobStatus === "completed",
@@ -281,6 +301,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
crawlerOptions: sc.crawlerOptions, crawlerOptions: sc.crawlerOptions,
origin: job.data.origin, origin: job.data.origin,
}); });
logger.info("Logged crawl!");
const data = { const data = {
success: jobStatus !== "failed", success: jobStatus !== "failed",