// TODO: refactor
import { load } from "cheerio"; // rustified
import { logger } from "../../../lib/logger";
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";

/**
 * Converts one raw `href` value into the link string that should be reported.
 *
 * The value is trimmed first, then classified:
 * - starts with `#`: skipped (`null`);
 * - starts with `http://`, `https://` or `mailto:`: returned unchanged;
 * - anything else (including values starting with `/`): resolved against
 *   `baseUrl` with the `URL` constructor.
 *
 * @param rawHref - Attribute value as found in the document.
 * @param baseUrl - URL that relative references are resolved against.
 * @returns The link to report, or `null` when the href is skipped or cannot
 *          be resolved (the failure is logged, not thrown).
 */
function resolveHref(rawHref: string, baseUrl: string): string | null {
  // Deliberately outside the try block: a non-string href is not a URL
  // construction failure, so it propagates to the caller instead of being logged.
  const href = rawHref.trim();

  if (href.startsWith("#")) {
    // Fragment-only links (#) are ignored
    return null;
  }

  if (
    href.startsWith("http://") ||
    href.startsWith("https://") ||
    href.startsWith("mailto:")
  ) {
    // Absolute URLs and mailto: links are added as is
    return href;
  }

  try {
    // Relative reference (with or without a leading '/'), resolve against the base
    return new URL(href, baseUrl).href;
  } catch (error) {
    logger.error(
      `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
      { error },
    );
    return null;
  }
}

/**
 * Resolves every href with {@link resolveHref} and returns the resulting
 * links without duplicates, in order of first appearance.
 *
 * @param hrefs - Raw href values in document order.
 * @param baseUrl - URL that relative references are resolved against.
 */
function collectUniqueLinks(hrefs: readonly string[], baseUrl: string): string[] {
  const unique = new Set<string>();
  for (const href of hrefs) {
    const link = resolveHref(href, baseUrl);
    if (link !== null) {
      unique.add(link);
    }
  }
  return [...unique];
}

/**
 * Extracts links using the `extractLinks` function of the html-transformer
 * module, then resolves and de-duplicates them.
 *
 * @param html - Markup to scan.
 * @param baseUrl - URL that relative references are resolved against.
 * @returns Unique links in order of first appearance.
 * @throws Whatever html-transformer's `extractLinks` rejects with; the caller
 *         uses that as the signal to fall back to cheerio.
 */
async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
  const hrefs = await _extractLinks(html);
  return collectUniqueLinks(hrefs, baseUrl);
}

/**
 * Extracts the links found in `html`.
 *
 * The html-transformer based extractor is tried first. If it throws, a warning
 * is logged and the `href` attributes of all `<a>` elements are read with
 * cheerio instead. Both paths share the same resolution rules (see
 * {@link resolveHref}).
 *
 * @param html - Markup to scan.
 * @param baseUrl - URL that relative references are resolved against.
 * @returns Unique links in order of first appearance.
 */
export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
  try {
    return await extractLinksRust(html, baseUrl);
  } catch (error) {
    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinks"
    });
  }

  const $ = load(html);
  const hrefs: string[] = [];

  $("a").each((_, element) => {
    const href = $(element).attr("href");
    // Anchors with a missing or empty href attribute are skipped before trimming.
    if (href) {
      hrefs.push(href);
    }
  });

  return collectUniqueLinks(hrefs, baseUrl);
}