Merge pull request #685 from devflowinc/main
bugfix: apply removeTags after onlyIncludeTags when both options are used together
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,30 +1,31 @@
|
|||||||
import cheerio, { AnyNode, Cheerio } from "cheerio";
|
import { AnyNode, Cheerio, load } from "cheerio";
|
||||||
import { PageOptions } from "../../../lib/entities";
|
import { PageOptions } from "../../../lib/entities";
|
||||||
import { excludeNonMainTags } from "./excludeTags";
|
import { excludeNonMainTags } from "./excludeTags";
|
||||||
|
|
||||||
export const removeUnwantedElements = (
|
export const removeUnwantedElements = (
|
||||||
html: string,
|
html: string,
|
||||||
pageOptions: PageOptions
|
pageOptions: PageOptions,
|
||||||
) => {
|
) => {
|
||||||
const soup = cheerio.load(html);
|
let soup = load(html);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
pageOptions.onlyIncludeTags &&
|
pageOptions.onlyIncludeTags &&
|
||||||
pageOptions.onlyIncludeTags.length > 0 &&
|
pageOptions.onlyIncludeTags.length > 0 &&
|
||||||
pageOptions.onlyIncludeTags[0] !== ''
|
pageOptions.onlyIncludeTags[0] !== ""
|
||||||
) {
|
) {
|
||||||
if (typeof pageOptions.onlyIncludeTags === "string") {
|
if (typeof pageOptions.onlyIncludeTags === "string") {
|
||||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||||
}
|
}
|
||||||
if (pageOptions.onlyIncludeTags.length !== 0) {
|
if (pageOptions.onlyIncludeTags.length !== 0) {
|
||||||
// Create a new root element to hold the tags to keep
|
// Create a new root element to hold the tags to keep
|
||||||
const newRoot = cheerio.load("<div></div>")("div");
|
const newRoot = load("<div></div>")("div");
|
||||||
pageOptions.onlyIncludeTags.forEach((tag) => {
|
pageOptions.onlyIncludeTags.forEach((tag) => {
|
||||||
soup(tag).each((index, element) => {
|
soup(tag).each((index, element) => {
|
||||||
newRoot.append(soup(element).clone());
|
newRoot.append(soup(element).clone());
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
return newRoot.html();
|
|
||||||
|
soup = load(newRoot.html());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -33,7 +34,7 @@ export const removeUnwantedElements = (
|
|||||||
if (
|
if (
|
||||||
pageOptions.removeTags &&
|
pageOptions.removeTags &&
|
||||||
pageOptions.removeTags.length > 0 &&
|
pageOptions.removeTags.length > 0 &&
|
||||||
pageOptions.removeTags[0] !== ''
|
pageOptions.removeTags[0] !== ""
|
||||||
) {
|
) {
|
||||||
if (typeof pageOptions.removeTags === "string") {
|
if (typeof pageOptions.removeTags === "string") {
|
||||||
pageOptions.removeTags = [pageOptions.removeTags];
|
pageOptions.removeTags = [pageOptions.removeTags];
|
||||||
@@ -51,11 +52,11 @@ export const removeUnwantedElements = (
|
|||||||
const attributes = element.attribs;
|
const attributes = element.attribs;
|
||||||
const tagNameMatches = regexPattern.test(element.name);
|
const tagNameMatches = regexPattern.test(element.name);
|
||||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
regexPattern.test(`${attr}="${attributes[attr]}"`),
|
||||||
);
|
);
|
||||||
if (tag.startsWith("*.")) {
|
if (tag.startsWith("*.")) {
|
||||||
classMatch = Object.keys(attributes).some((attr) =>
|
classMatch = Object.keys(attributes).some((attr) =>
|
||||||
regexPattern.test(`class="${attributes[attr]}"`)
|
regexPattern.test(`class="${attributes[attr]}"`),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return tagNameMatches || attributesMatch || classMatch;
|
return tagNameMatches || attributesMatch || classMatch;
|
||||||
|
|||||||
Reference in New Issue
Block a user