Files
firecrawl/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
T

123 lines
2.8 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
// TODO: refactor
import { AnyNode, Cheerio, load } from "cheerio";
import { ScrapeOptions } from "../../../controllers/v1/types";
/**
 * CSS selectors for page chrome (navigation, footers, ads, overlays, ...).
 *
 * When `scrapeOptions.onlyMainContent` is set, `removeUnwantedElements`
 * removes every element matching one of these selectors, except elements that
 * contain a `forceIncludeMainTags` match.
 */
const excludeNonMainTags = [
  // Structural page chrome
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  // Modals and overlays
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  // Advertising
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  // Language switchers
  ".lang-selector",
  ".language",
  "#language-selector",
  // Social links
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  // Menus and breadcrumbs
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  // Search, share and widgets
  "#search-form",
  ".search",
  "#search",
  ".share",
  "#share",
  ".widget",
  "#widget",
  // Cookie banners
  ".cookie",
  "#cookie",
];
2024-12-11 19:46:11 -03:00
/**
 * Selectors that protect an element from `onlyMainContent` pruning: an element
 * matching `excludeNonMainTags` is kept when it contains a match for any of
 * these.
 */
const forceIncludeMainTags: string[] = ["#main"];
2024-11-07 20:57:33 +01:00
/** Returns the entries of `selectors` that contain at least one non-whitespace character. */
const keepNonBlankSelectors = (selectors: string[] | undefined): string[] =>
  (selectors ?? []).filter((selector) => selector.trim().length !== 0);

/**
 * Strips unwanted markup from an HTML document according to `scrapeOptions`.
 *
 * Two mutually exclusive modes:
 *
 * 1. `includeTags` has at least one non-blank selector: clones of every
 *    matching element are appended (selector by selector, in document order)
 *    to a fresh wrapper `<div>`, and the wrapper's inner HTML is returned.
 *    No other cleanup (script removal, `excludeTags`, `onlyMainContent`) runs.
 * 2. Otherwise: `script`, `style`, `noscript`, `meta` and `head` are removed,
 *    then everything matched by `excludeTags`, then, when `onlyMainContent`
 *    is set, everything matched by `excludeNonMainTags` that does not contain
 *    a `forceIncludeMainTags` match.
 *
 * An `excludeTags` entry that both starts and ends with `*` is a wildcard: the
 * text between the asterisks is compiled as a case-insensitive regular
 * expression and tested against each element's tag name and against each of
 * its attributes rendered as `name="value"`. Any other entry is used as a CSS
 * selector.
 *
 * @param html - HTML source to clean.
 * @param scrapeOptions - Options; only `includeTags`, `excludeTags` and
 *   `onlyMainContent` are read here.
 * @returns The serialized HTML after filtering.
 */
export const removeUnwantedElements = (
  html: string,
  scrapeOptions: ScrapeOptions,
): string => {
  const soup = load(html);

  // Blank entries are dropped up front: the mode checks below only count
  // non-blank selectors, so the loops must not see blank ones either.
  const includeTags = keepNonBlankSelectors(scrapeOptions.includeTags);
  if (includeTags.length > 0) {
    // Create a new root element to hold the tags to keep
    const newRoot = load("<div></div>")("div");
    includeTags.forEach((tag) => {
      soup(tag).each((_, element) => {
        newRoot.append(soup(element).clone());
      });
    });
    return newRoot.html() ?? "";
  }

  soup("script, style, noscript, meta, head").remove();

  keepNonBlankSelectors(scrapeOptions.excludeTags).forEach((tag) => {
    let elementsToRemove: Cheerio<AnyNode>;
    if (tag.startsWith("*") && tag.endsWith("*")) {
      const regexPattern = new RegExp(tag.slice(1, -1), "i");
      const isClassWildcard = tag.startsWith("*.");
      elementsToRemove = soup("*").filter((_, element) => {
        if (element.type === "tag") {
          const attributes = element.attribs;
          const tagNameMatches = regexPattern.test(element.name);
          const attributesMatch = Object.entries(attributes).some(
            ([name, value]) => regexPattern.test(`${name}="${value}"`),
          );
          // Class wildcards look at the class attribute only, so another
          // attribute's value can never be matched as if it were a class.
          // This is the same string attributesMatch tests for `class`.
          const classMatch =
            isClassWildcard &&
            attributes.class !== undefined &&
            regexPattern.test(`class="${attributes.class}"`);
          return tagNameMatches || attributesMatch || classMatch;
        }
        return false;
      });
    } else {
      elementsToRemove = soup(tag);
    }
    elementsToRemove.remove();
  });

  if (scrapeOptions.onlyMainContent) {
    // Same filter for every selector, so build it once: keep only elements
    // that contain none of the force-included selectors.
    const withoutForcedContent = forceIncludeMainTags
      .map((x) => ":not(:has(" + x + "))")
      .join("");
    excludeNonMainTags.forEach((tag) => {
      soup(tag).filter(withoutForcedContent).remove();
    });
  }

  return soup.html();
};