Moved to utils/removeUnwantedElements, added unit tests

This commit is contained in:
rafaelsideguide
2024-06-18 09:46:42 -03:00
parent 8b3c3aae91
commit 6c726a02eb
3 changed files with 105 additions and 39 deletions
+1 -39
View File
@@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
import axios from "axios";
dotenv.config();
@@ -313,44 +313,6 @@ export async function scrapSingleUrl(
): Promise<Document> {
urlToScrap = urlToScrap.trim();
/**
 * Strips non-content markup from an HTML string and returns the cleaned HTML.
 *
 * Always removes `script`, `style`, `iframe`, `noscript`, `meta` and `head`
 * elements. Additional removals are driven by `pageOptions`:
 *
 * - `removeTags`: a single selector or a list of selectors to remove. An entry
 *   that both starts and ends with `*` (e.g. `*sidebar*`) is not used as a
 *   selector; the text between the asterisks is compiled as a regular
 *   expression wrapped in word boundaries and every element having at least
 *   one class name that matches it is removed.
 * - `onlyMainContent`: when truthy, every selector in `excludeNonMainTags`
 *   is removed as well.
 *
 * The `pageOptions` object is treated as read-only.
 *
 * @param html - Raw HTML to clean.
 * @param pageOptions - Options controlling which extra elements are removed.
 * @returns The serialized HTML after all removals.
 */
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const soup = cheerio.load(html);
  soup("script, style, iframe, noscript, meta, head").remove();

  // Normalize into a local list rather than assigning back onto pageOptions:
  // the options object belongs to the caller and may be reused across calls.
  const requestedTags = pageOptions.removeTags;
  let tagsToRemove: string[] = [];
  if (typeof requestedTags === "string") {
    // An empty string is treated as "nothing requested".
    if (requestedTags) {
      tagsToRemove = [requestedTags];
    }
  } else if (Array.isArray(requestedTags)) {
    tagsToRemove = requestedTags;
  }

  for (const tag of tagsToRemove) {
    if (tag.startsWith("*") && tag.endsWith("*")) {
      // `*pattern*` form: match the pattern against each individual class name.
      const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
      soup("*")
        .filter((index, element) => {
          const classNames = soup(element).attr("class");
          if (!classNames) {
            return false;
          }
          return classNames
            .split(/\s+/)
            .some((className) => regexPattern.test(className));
        })
        .remove();
    } else {
      // Plain entry: used directly as a selector.
      soup(tag).remove();
    }
  }

  if (pageOptions.onlyMainContent) {
    // Remove any other tags that are not in the main content.
    for (const tag of excludeNonMainTags) {
      soup(tag).remove();
    }
  }

  return soup.html();
};
const attemptScraping = async (
url: string,
method: (typeof baseScrapers)[number]