Files
firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts
T
2024-12-11 19:51:08 -03:00

83 lines
1.7 KiB
TypeScript

import { logger } from "../../../lib/logger";
/**
 * Domains that `isUrlBlocked` reports as blocked (unless the URL is exempted
 * by an allowed keyword first).
 *
 * Each entry is a bare domain name without scheme or path; the hostname
 * matcher in `isUrlBlocked` also covers subdomains of every entry.
 */
const socialMediaBlocklist: readonly string[] = [
  "facebook.com",
  "x.com",
  "twitter.com",
  "instagram.com",
  "linkedin.com",
  "snapchat.com",
  "tiktok.com",
  "reddit.com",
  "tumblr.com",
  "flickr.com",
  "whatsapp.com",
  "wechat.com",
  "telegram.org",
  "researchhub.com",
  "youtube.com",
  "corterix.com",
  "southwest.com",
  "ryanair.com",
];
/**
 * Plain words that exempt a URL from the blocklist when they occur in it as
 * a whole word.
 */
const allowedGenericKeywords: readonly string[] = [
  "pulse",
  "privacy",
  "terms",
  "policy",
  "user-agreement",
  "legal",
  "help",
  "policies",
  "support",
  "contact",
  "about",
  "careers",
  "blog",
  "press",
  "conditions",
  "tos",
];

/**
 * Scheme-anchored URL fragments (they start at the `://` separator) that
 * exempt specific hosts or paths of otherwise blocked domains.
 */
const allowedUrlFragments: readonly string[] = [
  "://library.tiktok.com",
  "://ads.tiktok.com",
  "://tiktok.com/business",
  "://developers.facebook.com",
];

/**
 * Every entry that lets a URL bypass the blocklist. `isUrlBlocked` tests each
 * entry as a case-insensitive, word-bounded pattern against the whole URL, so
 * a hit anywhere in the URL is enough to allow it.
 */
const allowedKeywords: readonly string[] = [
  ...allowedGenericKeywords,
  ...allowedUrlFragments,
];
/**
 * Escapes every regular-expression metacharacter in `text` so the result can
 * be embedded in a `RegExp` source and matched literally.
 */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Decides whether a URL points at a blocklisted host.
 *
 * Evaluation order:
 * 1. If the URL contains any `allowedKeywords` entry delimited by word
 *    boundaries (case-insensitive), it is allowed regardless of its host.
 * 2. Otherwise the URL is parsed, with `https://` prepended when it carries
 *    no http(s) scheme, and its hostname is compared with every
 *    `socialMediaBlocklist` entry. An entry matches when it starts at the
 *    beginning of the hostname or right after a dot, and is followed by a dot
 *    or the end of the hostname, so subdomains are blocked as well.
 *
 * @param url - Absolute http(s) URL, or a host/path without a scheme.
 *   Surrounding whitespace is ignored.
 * @returns `true` when the host is blocklisted; `false` when the URL is
 *   allowed or cannot be parsed (a parse failure is logged, not thrown).
 */
export function isUrlBlocked(url: string): boolean {
  const trimmedUrl = url.trim();
  const lowerCaseUrl = trimmedUrl.toLowerCase();

  // Allow-list entries are literal text: escape them so "." only matches a
  // dot and cannot be satisfied by an arbitrary character.
  const isAllowListed = allowedKeywords.some((keyword) =>
    new RegExp(`\\b${escapeRegExp(keyword)}\\b`, "i").test(lowerCaseUrl),
  );
  if (isAllowListed) {
    return false;
  }

  // URL schemes are case-insensitive. Testing the lowercased copy keeps
  // "HTTPS://host" from being prefixed a second time, which would make the
  // parser report the scheme name as the hostname and bypass the blocklist.
  const hasHttpScheme =
    lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://");
  const absoluteUrl = hasHttpScheme ? trimmedUrl : "https://" + trimmedUrl;

  try {
    const hostname = new URL(absoluteUrl).hostname.toLowerCase();
    return socialMediaBlocklist.some((domain) =>
      new RegExp(`(^|\\.)${escapeRegExp(domain)}(\\.|$)`, "i").test(hostname),
    );
  } catch (e) {
    // An unparsable URL is treated as not blocked; the failure is only logged.
    logger.error(`Error parsing the following URL: ${absoluteUrl}`);
    return false;
  }
}