feat(queue-worker): add more logs around crawl finishing logic
This commit is contained in:
@@ -109,7 +109,16 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
|
|||||||
const runningJobs: Set<string> = new Set();
|
const runningJobs: Set<string> = new Set();
|
||||||
|
|
||||||
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||||
|
const logger = _logger.child({
|
||||||
|
module: "queue-worker",
|
||||||
|
method: "finishCrawlIfNeeded",
|
||||||
|
jobId: job.id,
|
||||||
|
scrapeId: job.id,
|
||||||
|
crawlId: job.data.crawl_id,
|
||||||
|
});
|
||||||
|
|
||||||
if (await finishCrawlPre(job.data.crawl_id)) {
|
if (await finishCrawlPre(job.data.crawl_id)) {
|
||||||
|
logger.info("Crawl is pre-finished, checking if we need to add more jobs");
|
||||||
if (
|
if (
|
||||||
job.data.crawlerOptions &&
|
job.data.crawlerOptions &&
|
||||||
!(await redisConnection.exists(
|
!(await redisConnection.exists(
|
||||||
@@ -131,6 +140,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
logger.info("Visited URLs", {
|
||||||
|
visitedUrls: visitedUrls.size,
|
||||||
|
});
|
||||||
|
|
||||||
const lastUrls: string[] = (
|
const lastUrls: string[] = (
|
||||||
(
|
(
|
||||||
await supabase_service.rpc("diff_get_last_crawl_urls", {
|
await supabase_service.rpc("diff_get_last_crawl_urls", {
|
||||||
@@ -142,6 +155,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
|
|
||||||
const lastUrlsSet = new Set(lastUrls);
|
const lastUrlsSet = new Set(lastUrls);
|
||||||
|
|
||||||
|
logger.info("Last URLs", {
|
||||||
|
lastUrls: lastUrlsSet.size,
|
||||||
|
});
|
||||||
|
|
||||||
const crawler = crawlToCrawler(
|
const crawler = crawlToCrawler(
|
||||||
job.data.crawl_id,
|
job.data.crawl_id,
|
||||||
sc,
|
sc,
|
||||||
@@ -162,15 +179,12 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
: sc.crawlerOptions.limit -
|
: sc.crawlerOptions.limit -
|
||||||
(await getDoneJobsOrderedLength(job.data.crawl_id));
|
(await getDoneJobsOrderedLength(job.data.crawl_id));
|
||||||
|
|
||||||
console.log(
|
|
||||||
sc.originUrl!,
|
|
||||||
univistedUrls,
|
|
||||||
visitedUrls,
|
|
||||||
lastUrls,
|
|
||||||
addableJobCount,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (univistedUrls.length !== 0 && addableJobCount > 0) {
|
if (univistedUrls.length !== 0 && addableJobCount > 0) {
|
||||||
|
logger.info("Adding jobs", {
|
||||||
|
univistedUrls: univistedUrls.length,
|
||||||
|
addableJobCount,
|
||||||
|
});
|
||||||
|
|
||||||
const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
|
const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
|
||||||
const uuid = uuidv4();
|
const uuid = uuidv4();
|
||||||
return {
|
return {
|
||||||
@@ -212,10 +226,15 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
);
|
);
|
||||||
await addScrapeJobs(lockedJobs);
|
await addScrapeJobs(lockedJobs);
|
||||||
|
|
||||||
|
logger.info("Added jobs, not going for the full finish", {
|
||||||
|
lockedJobs: lockedJobs.length,
|
||||||
|
});
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.info("Finishing crawl");
|
||||||
await finishCrawl(job.data.crawl_id);
|
await finishCrawl(job.data.crawl_id);
|
||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
@@ -267,6 +286,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
)
|
)
|
||||||
.filter((x) => x !== null);
|
.filter((x) => x !== null);
|
||||||
|
|
||||||
|
logger.info("Logging crawl NOW!");
|
||||||
await logJob({
|
await logJob({
|
||||||
job_id: job.data.crawl_id,
|
job_id: job.data.crawl_id,
|
||||||
success: jobStatus === "completed",
|
success: jobStatus === "completed",
|
||||||
@@ -281,6 +301,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
|||||||
crawlerOptions: sc.crawlerOptions,
|
crawlerOptions: sc.crawlerOptions,
|
||||||
origin: job.data.origin,
|
origin: job.data.origin,
|
||||||
});
|
});
|
||||||
|
logger.info("Logged crawl!");
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
success: jobStatus !== "failed",
|
success: jobStatus !== "failed",
|
||||||
|
|||||||
Reference in New Issue
Block a user