diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3828f830..cb93eede 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -230,8 +230,11 @@ export class WebCrawler { const $ = load(html); $("a").each((_, element) => { - const href = $(element).attr("href"); + let href = $(element).attr("href"); if (href) { + if (href.match(/^https?:\/[^\/]/)) { + href = href.replace(/^https?:\/[^\/]/, "$&/"); + } const u = this.filterURL(href, url); if (u !== null) { links.push(u); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index db976927..c75acd32 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,12 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!); - const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join("."); - const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined); + const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!); const links = crawler.filterLinks( - crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string), + crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!), Infinity, sc.crawlerOptions?.maxDepth ?? 10 );