fix handling of badly formatted URLs
This commit is contained in:
@@ -230,8 +230,11 @@ export class WebCrawler {
|
|||||||
|
|
||||||
const $ = load(html);
|
const $ = load(html);
|
||||||
$("a").each((_, element) => {
|
$("a").each((_, element) => {
|
||||||
const href = $(element).attr("href");
|
let href = $(element).attr("href");
|
||||||
if (href) {
|
if (href) {
|
||||||
|
if (href.match(/^https?:\/[^\/]/)) {
|
||||||
|
// Repair "http:/host" -> "http://host". Match only the scheme and its single slash (the lookahead requires a non-slash next, without consuming it) so "$&/" inserts the missing slash right after "http:/" rather than after the first host character.
href = href.replace(/^https?:\/(?=[^\/])/, "$&/");
|
||||||
|
}
|
||||||
const u = this.filterURL(href, url);
|
const u = this.filterURL(href, url);
|
||||||
if (u !== null) {
|
if (u !== null) {
|
||||||
links.push(u);
|
links.push(u);
|
||||||
|
|||||||
@@ -352,12 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
|
|
||||||
if (job.data.crawlerOptions !== null) {
|
if (job.data.crawlerOptions !== null) {
|
||||||
if (!sc.cancelled) {
|
if (!sc.cancelled) {
|
||||||
const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
|
||||||
const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join(".");
|
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined);
|
|
||||||
|
|
||||||
const links = crawler.filterLinks(
|
const links = crawler.filterLinks(
|
||||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
|
||||||
Infinity,
|
Infinity,
|
||||||
sc.crawlerOptions?.maxDepth ?? 10
|
sc.crawlerOptions?.maxDepth ?? 10
|
||||||
);
|
);
|
||||||
|
|||||||
Reference in New Issue
Block a user