Nick: crawl fixes
This commit is contained in:
@@ -129,7 +129,7 @@ export async function crawlController(
|
|||||||
priority: 20,
|
priority: 20,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
});
|
})
|
||||||
|
|
||||||
await lockURLs(
|
await lockURLs(
|
||||||
id,
|
id,
|
||||||
|
|||||||
@@ -158,11 +158,22 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||||||
|
|
||||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||||
export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
|
export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
|
||||||
urls = urls.map(url => {
|
urls = urls.map(url => normalizeURL(url, sc));
|
||||||
return normalizeURL(url, sc);
|
|
||||||
});
|
// Add to visited_unique set
|
||||||
|
await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
|
||||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
|
||||||
|
|
||||||
|
let res: boolean;
|
||||||
|
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||||
|
const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
|
||||||
|
res = x === urls.length;
|
||||||
|
} else {
|
||||||
|
const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href));
|
||||||
|
const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations);
|
||||||
|
res = x === allPermutations.length;
|
||||||
|
}
|
||||||
|
|
||||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user