// File: firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts
// 2024-11-07 20:57:33 +01:00
import { logger } from "../../../lib/logger";
// 2024-12-12 16:10:10 -05:00
import crypto from "crypto";
import { configDotenv } from "dotenv";
configDotenv();
// 2024-07-25 09:48:06 -03:00
// 2024-12-12 16:10:10 -05:00
// Key bytes for the encrypted blocklist, read from the HASH_KEY environment
// variable; an unset variable yields an empty buffer.
const hashKey = Buffer.from(process.env.HASH_KEY ?? "", "utf-8");
const algorithm = "aes-256-ecb";

/**
 * Decrypts a base64-encoded ciphertext with the module's cipher
 * (`algorithm`, used without an IV) and returns the plaintext as UTF-8.
 *
 * @param ciphertext Base64 text of the encrypted bytes.
 * @param key Raw key bytes handed to the decipher.
 * @returns The decrypted plaintext.
 * @throws Whatever Node's crypto raises for an unusable key or undecryptable data.
 */
function decryptAES(ciphertext: string, key: Buffer): string {
  const decipher = crypto.createDecipheriv(algorithm, key, null);
  const leadingBytes = decipher.update(Buffer.from(ciphertext, "base64"));
  const trailingBytes = decipher.final();
  const plaintext = Buffer.concat([leadingBytes, trailingBytes]);
  return plaintext.toString("utf-8");
}
// Encrypted blocklist entries, stored as base64 ciphertext strings. They are
// never compared directly: each one is passed through decryptAES with hashKey
// to build decryptedBlocklist, whose entries isUrlBlocked matches against URL
// hostnames. Entries must stay byte-identical to remain decryptable.
const urlBlocklist = [
"h8ngAFXUNLO3ZqQufJjGVA==",
"fEGiDm/TWDBkXUXejFVICg==",
"l6Mei7IGbEmTTFoSudUnqQ==",
"4OjallJzXRiZUAWDiC2Xww==",
"ReSvkSfx34TNEdecmmSDdQ==",
"X1E4WtdmXAv3SAX9xN925Q==",
"VTzBQfMtXZzM05mnNkWkjA==",
"m/q4Lb2Z8cxwU7/CoztOFg==",
"UbVnmRaeG+gKcyVDLAm0vg==",
"xNQhczYG22tTVc6lYE3qwg==",
"CQfGDydbg4l1swRCru6O6Q==",
"l86LQxm2NonTWMauXwEsPw==",
"6v4QDUcwjnID80G+uU+tgw==",
"pCF/6nrKZAxaYntzEGluZQ==",
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
"QV436UZuQ6D0Dqrx9MwaGw==",
"OYVvrwILYbzA2mSSqOPPpw==",
"xW2i4C0Dzcnp+qu12u0SAw==",
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
"ObdlM5NEkvBJ/sojRW5K/Q==",
"C8Th38X0SjsE1vL/OsD8bA==",
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
];
// 2024-12-12 16:10:10 -05:00
// Plaintext blocklist entries, decrypted once when the module loads. With an
// empty key (HASH_KEY unset) nothing is decrypted and the list stays empty.
const decryptedBlocklist: string[] =
  hashKey.length === 0
    ? []
    : urlBlocklist.map((encryptedEntry) => decryptAES(encryptedEntry, hashKey));
// 2024-05-20 17:24:21 -07:00
// Keywords that exempt a URL from the blocklist: isUrlBlocked returns false
// as soon as any of these appears in the URL delimited by word boundaries.
// Plain words are matched anywhere in the URL; entries starting with "://"
// are matched together with the scheme separator that precedes the host.
// Keep entries lowercase — they are tested against the lowercased URL.
const allowedKeywords = [
  "pulse",
  "privacy",
  "terms",
  "policy",
  "user-agreement",
  "legal",
  "help",
  "policies",
  "support",
  "contact",
  "about",
  "careers",
  "blog",
  "press",
  "conditions",
  "tos",
  "://library.tiktok.com",
  "://ads.tiktok.com",
  "://tiktok.com/business",
  "://developers.facebook.com",
];
/** Escapes regular-expression metacharacters so `text` is matched literally. */
function escapeRegExpLiteral(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Reports whether `domain` occurs in `hostname` as a run of complete
 * dot-separated labels (e.g. "example.com" matches "example.com" and
 * "www.example.com", but not "notexample.com"). Both arguments are expected
 * to be lowercase already.
 */
function hostnameContainsDomain(hostname: string, domain: string): boolean {
  // Wrapping both sides in dots turns "preceded by start-or-dot and followed
  // by dot-or-end" into a plain substring test, with no regex escaping needed.
  return `.${hostname}.`.includes(`.${domain}.`);
}

/**
 * Decides whether a URL points at a blocklisted domain.
 *
 * A URL is never blocked when it contains one of `allowedKeywords` as a whole
 * word. Otherwise its hostname is compared against every entry of
 * `decryptedBlocklist`; an entry matches the host itself and any host that
 * contains it on label boundaries.
 *
 * @param url URL to check; "https://" is assumed when no http(s) scheme is present.
 * @returns `true` when the hostname matches a blocklist entry; `false` when
 *   the URL is allow-listed, matches nothing, or cannot be parsed (the parse
 *   failure is logged).
 */
export function isUrlBlocked(url: string): boolean {
  // Add the default scheme up front (detected case-insensitively) so both the
  // allow-list entries that start with "://" and the URL parser see the same
  // normalized string.
  const normalizedUrl = /^https?:\/\//i.test(url) ? url : "https://" + url;
  const lowerCaseUrl = normalizedUrl.toLowerCase();

  // Check if the URL contains any allowed keywords as whole words. Keywords
  // are escaped so that characters such as "." only match themselves.
  const isAllowListed = allowedKeywords.some((keyword) =>
    new RegExp(`\\b${escapeRegExpLiteral(keyword)}\\b`, "i").test(lowerCaseUrl),
  );
  if (isAllowListed) {
    return false;
  }

  try {
    const hostname = new URL(normalizedUrl).hostname.toLowerCase();

    // Check if the hostname matches any domain in the blocklist.
    return decryptedBlocklist.some((domain) =>
      hostnameContainsDomain(hostname, domain.toLowerCase()),
    );
  } catch {
    // If an error occurs (e.g., invalid URL), return false
    logger.error(`Error parsing the following URL: ${normalizedUrl}`);
    return false;
  }
}