Merge pull request #685 from devflowinc/main

bugfix: allow onlyIncludeTags and removeTags to be used together (the onlyIncludeTags branch previously returned early, so removeTags was never applied)
This commit is contained in:
Nicolas
2024-09-30 17:18:30 -03:00
committed by GitHub
2 changed files with 27 additions and 9 deletions
File diff suppressed because one or more lines are too long
@@ -1,30 +1,31 @@
import cheerio, { AnyNode, Cheerio } from "cheerio"; import { AnyNode, Cheerio, load } from "cheerio";
import { PageOptions } from "../../../lib/entities"; import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags"; import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = ( export const removeUnwantedElements = (
html: string, html: string,
pageOptions: PageOptions pageOptions: PageOptions,
) => { ) => {
const soup = cheerio.load(html); let soup = load(html);
if ( if (
pageOptions.onlyIncludeTags && pageOptions.onlyIncludeTags &&
pageOptions.onlyIncludeTags.length > 0 && pageOptions.onlyIncludeTags.length > 0 &&
pageOptions.onlyIncludeTags[0] !== '' pageOptions.onlyIncludeTags[0] !== ""
) { ) {
if (typeof pageOptions.onlyIncludeTags === "string") { if (typeof pageOptions.onlyIncludeTags === "string") {
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags]; pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
} }
if (pageOptions.onlyIncludeTags.length !== 0) { if (pageOptions.onlyIncludeTags.length !== 0) {
// Create a new root element to hold the tags to keep // Create a new root element to hold the tags to keep
const newRoot = cheerio.load("<div></div>")("div"); const newRoot = load("<div></div>")("div");
pageOptions.onlyIncludeTags.forEach((tag) => { pageOptions.onlyIncludeTags.forEach((tag) => {
soup(tag).each((index, element) => { soup(tag).each((index, element) => {
newRoot.append(soup(element).clone()); newRoot.append(soup(element).clone());
}); });
}); });
return newRoot.html();
soup = load(newRoot.html());
} }
} }
@@ -33,7 +34,7 @@ export const removeUnwantedElements = (
if ( if (
pageOptions.removeTags && pageOptions.removeTags &&
pageOptions.removeTags.length > 0 && pageOptions.removeTags.length > 0 &&
pageOptions.removeTags[0] !== '' pageOptions.removeTags[0] !== ""
) { ) {
if (typeof pageOptions.removeTags === "string") { if (typeof pageOptions.removeTags === "string") {
pageOptions.removeTags = [pageOptions.removeTags]; pageOptions.removeTags = [pageOptions.removeTags];
@@ -51,11 +52,11 @@ export const removeUnwantedElements = (
const attributes = element.attribs; const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name); const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some((attr) => const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`) regexPattern.test(`${attr}="${attributes[attr]}"`),
); );
if (tag.startsWith("*.")) { if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some((attr) => classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`) regexPattern.test(`class="${attributes[attr]}"`),
); );
} }
return tagNameMatches || attributesMatch || classMatch; return tagNameMatches || attributesMatch || classMatch;