Merge branch 'main' into nsc/new-extract
This commit is contained in:
@@ -80,6 +80,17 @@ function startServer(port = DEFAULT_PORT) {
|
|||||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const exitHandler = () => {
|
||||||
|
logger.info('SIGTERM signal received: closing HTTP server')
|
||||||
|
server.close(() => {
|
||||||
|
logger.info("Server closed.");
|
||||||
|
process.exit(0);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
process.on('SIGTERM', exitHandler);
|
||||||
|
process.on('SIGINT', exitHandler);
|
||||||
return server;
|
return server;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
initialUrl: sc.originUrl!,
|
initialUrl: initialUrl ?? sc.originUrl!,
|
||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ export class WebCrawler {
|
|||||||
let fullUrl = href;
|
let fullUrl = href;
|
||||||
if (!href.startsWith("http")) {
|
if (!href.startsWith("http")) {
|
||||||
try {
|
try {
|
||||||
fullUrl = new URL(href, this.baseUrl).toString();
|
fullUrl = new URL(href, url).toString();
|
||||||
} catch (_) {
|
} catch (_) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -352,10 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
|
|
||||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||||
if (!sc.cancelled) {
|
if (!sc.cancelled) {
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined);
|
||||||
|
|
||||||
const links = crawler.filterLinks(
|
const links = crawler.filterLinks(
|
||||||
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl as string),
|
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
||||||
Infinity,
|
Infinity,
|
||||||
sc.crawlerOptions?.maxDepth ?? 10
|
sc.crawlerOptions?.maxDepth ?? 10
|
||||||
);
|
);
|
||||||
@@ -504,9 +504,6 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
job: job.id,
|
job: job.id,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
} else {
|
|
||||||
logger.error(`🐂 Job timed out ${job.id}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (error instanceof CustomError) {
|
if (error instanceof CustomError) {
|
||||||
// Here we handle the error, then save the failed job
|
// Here we handle the error, then save the failed job
|
||||||
@@ -516,6 +513,9 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
if (error.stack) {
|
if (error.stack) {
|
||||||
logger.error(error.stack);
|
logger.error(error.stack);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
logger.error(`🐂 Job timed out ${job.id}`);
|
||||||
|
}
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
success: false,
|
success: false,
|
||||||
|
|||||||
Reference in New Issue
Block a user