// File: firecrawl/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts (TypeScript, 84 lines, 2.6 KiB)
// TODO: refactor
import { load } from "cheerio"; // rustified
import { logger } from "../../../lib/logger";
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";
/**
 * Collects link targets from `html` via the html-transformer extractor and
 * normalizes them against `baseUrl`.
 *
 * Each href is trimmed, then:
 * - `http://`, `https://` and `mailto:` hrefs are kept verbatim;
 * - hrefs starting with `#` are dropped;
 * - everything else is resolved against `baseUrl` with the `URL` constructor.
 *
 * An href that cannot be resolved is logged and skipped; it never aborts the
 * whole extraction.
 *
 * @param html - Markup to scan for links.
 * @param baseUrl - URL that relative hrefs are resolved against.
 * @returns The de-duplicated links, in first-seen order.
 */
async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
  const rawHrefs = await _extractLinks(html);
  // A Set preserves insertion order while discarding repeats.
  const seen = new Set<string>();

  for (const rawHref of rawHrefs) {
    const target = rawHref.trim();

    // Fragment-only links (#) are ignored
    if (target.startsWith("#")) {
      continue;
    }

    try {
      // Absolute web URLs and mailto: links are added as is; anything else is
      // treated as relative and resolved against the base URL.
      const keepVerbatim =
        target.startsWith("http://") ||
        target.startsWith("https://") ||
        target.startsWith("mailto:");
      seen.add(keepVerbatim ? target : new URL(target, baseUrl).href);
    } catch (error) {
      logger.error(
        `Failed to construct URL for href: ${target} with base: ${baseUrl}`,
        { error },
      );
    }
  }

  return Array.from(seen);
}
/**
 * Extracts the link targets found in `html`, normalized against `baseUrl`.
 *
 * The html-transformer backed `extractLinksRust` is tried first. If it throws,
 * a warning is logged and the links are extracted from `<a>` elements with
 * cheerio instead.
 *
 * In the cheerio fallback, anchors with a missing or empty `href` are skipped.
 * Each remaining href is trimmed, then:
 * - `http://`, `https://` and `mailto:` hrefs are kept verbatim;
 * - hrefs starting with `#` are dropped;
 * - everything else is resolved against `baseUrl` with the `URL` constructor.
 *
 * An href that cannot be resolved is logged and skipped.
 *
 * @param html - Markup to scan for links.
 * @param baseUrl - URL that relative hrefs are resolved against.
 * @returns The de-duplicated links, in first-seen order.
 */
export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
  try {
    return await extractLinksRust(html, baseUrl);
  } catch (error) {
    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinks"
    });
  }

  const $ = load(html);
  const links: string[] = [];

  $("a").each((_, element) => {
    let href = $(element).attr("href");
    if (href) {
      href = href.trim();
      try {
        if (
          href.startsWith("http://") ||
          href.startsWith("https://") ||
          href.startsWith("mailto:")
        ) {
          // Absolute URLs and mailto: links, add as is
          links.push(href);
        } else if (!href.startsWith("#")) {
          // Relative URL (with or without a leading '/'), resolve against base URL
          links.push(new URL(href, baseUrl).href);
        }
        // Fragment-only links (#) are ignored
      } catch (error) {
        logger.error(
          `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
          { error },
        );
      }
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}