fix(queue-worker/crawl): use SCARD to generate num_docs field

This commit is contained in:
Móricz Gergő
2025-01-09 09:51:16 +01:00
parent 9e8c629ff4
commit 49e584f8e1
2 changed files with 7 additions and 2 deletions
+4
View File
@@ -160,6 +160,10 @@ export async function getCrawlJobs(id: string): Promise<string[]> {
return await redisConnection.smembers("crawl:" + id + ":jobs"); return await redisConnection.smembers("crawl:" + id + ":jobs");
} }
/**
 * Counts the jobs registered for a crawl without fetching their IDs.
 *
 * Issues SCARD against the Redis key `crawl:<id>:jobs` — the same key that
 * `getCrawlJobs` reads with SMEMBERS — so only the member count is
 * transferred rather than the full list of job IDs.
 *
 * @param id - Crawl identifier used to build the Redis key.
 * @returns The value reported by SCARD for the crawl's job set.
 */
export async function getCrawlJobCount(id: string): Promise<number> {
  const jobsKey = "crawl:" + id + ":jobs";
  const jobCount = await redisConnection.scard(jobsKey);
  return jobCount;
}
export async function getThrottledJobs(teamId: string): Promise<string[]> { export async function getThrottledJobs(teamId: string): Promise<string[]> {
return await redisConnection.zrangebyscore( return await redisConnection.zrangebyscore(
"concurrency-limiter:" + teamId + ":throttled", "concurrency-limiter:" + teamId + ":throttled",
+3 -2
View File
@@ -25,6 +25,7 @@ import {
finishCrawl, finishCrawl,
generateURLPermutations, generateURLPermutations,
getCrawl, getCrawl,
getCrawlJobCount,
getCrawlJobs, getCrawlJobs,
lockURL, lockURL,
lockURLs, lockURLs,
@@ -212,7 +213,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
); );
} }
} else { } else {
const jobIDs = await getCrawlJobs(job.data.crawl_id); const num_docs = await getCrawlJobCount(job.data.crawl_id);
const jobStatus = sc.cancelled ? "failed" : "completed"; const jobStatus = sc.cancelled ? "failed" : "completed";
await logJob( await logJob(
@@ -220,7 +221,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
job_id: job.data.crawl_id, job_id: job.data.crawl_id,
success: jobStatus === "completed", success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : undefined, message: sc.cancelled ? "Cancelled" : undefined,
num_docs: jobIDs.length, num_docs,
docs: [], docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000, time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id, team_id: job.data.team_id,