From 52806807a183aa80222a206b9caf7a2b6b8e0e80 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 3 Dec 2024 16:25:55 -0300
Subject: [PATCH] Nick: crawl fixes

---
 apps/api/src/controllers/v1/crawl.ts |  2 +-
 apps/api/src/lib/crawl-redis.ts      | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index aaf33f29..2195913f 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -129,7 +129,7 @@ export async function crawlController(
             priority: 20,
           },
         };
-      });
+      })
 
       await lockURLs(
         id,
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index ba7487bd..68087993 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -158,11 +158,22 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
 
 /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
 export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
-    urls = urls.map(url => {
-        return normalizeURL(url, sc);
-    });
-
-    const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+    urls = urls.map(url => normalizeURL(url, sc));
+
+    // Add to visited_unique set
+    await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
+    await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
+
+    let res: boolean;
+    if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
+        const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
+        res = x === urls.length;
+    } else {
+        const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href));
+        const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations);
+        res = x === allPermutations.length;
+    }
+
     await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
     return res;
 }