Merge branch 'main' into nsc/new-extract

This commit is contained in:
Nicolas
2024-11-12 12:24:47 -05:00
4 changed files with 25 additions and 14 deletions
+11
View File
@@ -80,6 +80,17 @@ function startServer(port = DEFAULT_PORT) {
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
); );
}); });
// Shutdown handler shared by the SIGTERM and SIGINT listeners registered below.
// Node passes the signal name as the first listener argument, so the log line
// names the signal that actually arrived instead of always claiming SIGTERM.
// The default keeps the original message for any caller that passes no argument.
const exitHandler = (signal: string = 'SIGTERM') => {
  logger.info(`${signal} signal received: closing HTTP server`);
  // Exit only from the close callback, i.e. after the server reports it has closed.
  server.close(() => {
    logger.info("Server closed.");
    process.exit(0);
  });
};
process.on('SIGTERM', exitHandler);
process.on('SIGINT', exitHandler);
return server; return server;
} }
+2 -2
View File
@@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
return res; return res;
} }
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: id, jobId: id,
initialUrl: sc.originUrl!, initialUrl: initialUrl ?? sc.originUrl!,
includes: sc.crawlerOptions?.includes ?? [], includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [], excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
+1 -1
View File
@@ -171,7 +171,7 @@ export class WebCrawler {
let fullUrl = href; let fullUrl = href;
if (!href.startsWith("http")) { if (!href.startsWith("http")) {
try { try {
fullUrl = new URL(href, this.baseUrl).toString(); fullUrl = new URL(href, url).toString();
} catch (_) { } catch (_) {
return null; return null;
} }
+11 -11
View File
@@ -352,10 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) { if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc); const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined);
const links = crawler.filterLinks( const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl as string), crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
Infinity, Infinity,
sc.crawlerOptions?.maxDepth ?? 10 sc.crawlerOptions?.maxDepth ?? 10
); );
@@ -504,19 +504,19 @@ async function processJob(job: Job & { id: string }, token: string) {
job: job.id, job: job.id,
}, },
}); });
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
logger.error(error.message); // or any other error handling
}
logger.error(error);
if (error.stack) {
logger.error(error.stack);
}
} else { } else {
logger.error(`🐂 Job timed out ${job.id}`); logger.error(`🐂 Job timed out ${job.id}`);
} }
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
logger.error(error.message); // or any other error handling
}
logger.error(error);
if (error.stack) {
logger.error(error.stack);
}
const data = { const data = {
success: false, success: false,
document: null, document: null,