Files
firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts
T

83 lines
1.7 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { logger } from "../../../lib/logger";
2024-07-25 09:48:06 -03:00
/**
 * Domains whose pages `isUrlBlocked` reports as blocked.
 *
 * Each entry is compared against the hostname of the URL being checked, so an
 * entry also covers its subdomains (e.g. `www.facebook.com`). Despite the
 * name, the list is not limited to social networks (it also contains, for
 * example, `southwest.com` and `ryanair.com`).
 *
 * Declared read-only so the list cannot be mutated at runtime by accident.
 */
const socialMediaBlocklist: readonly string[] = [
  "facebook.com",
  "x.com",
  "twitter.com",
  "instagram.com",
  "linkedin.com",
  "snapchat.com",
  "tiktok.com",
  "reddit.com",
  "tumblr.com",
  "flickr.com",
  "whatsapp.com",
  "wechat.com",
  "telegram.org",
  "researchhub.com",
  "youtube.com",
  "corterix.com",
  "southwest.com",
  "ryanair.com",
];
2024-05-20 17:24:21 -07:00
/**
 * Keywords that exempt a URL from blocking.
 *
 * `isUrlBlocked` returns `false` as soon as any of these appears in the
 * lower-cased URL delimited by regex word boundaries, before the blocklist is
 * consulted. Plain words (e.g. `privacy`, `terms`) let informational pages on
 * blocked domains through; entries beginning with `://` name specific hosts
 * or paths on domains that are otherwise blocked (tiktok.com, facebook.com).
 *
 * Declared read-only so the list cannot be mutated at runtime by accident.
 */
const allowedKeywords: readonly string[] = [
  "pulse",
  "privacy",
  "terms",
  "policy",
  "user-agreement",
  "legal",
  "help",
  "policies",
  "support",
  "contact",
  "about",
  "careers",
  "blog",
  "press",
  "conditions",
  "tos",
  "://library.tiktok.com",
  "://ads.tiktok.com",
  "://tiktok.com/business",
  "://developers.facebook.com",
];
/** Escapes regex metacharacters so `text` is matched literally inside a RegExp. */
function escapeForRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Cache of the compiled allow-list patterns; filled on first use. */
let allowedKeywordPatterns: RegExp[] | undefined;

/**
 * Returns one word-boundary pattern per entry of `allowedKeywords`, compiling
 * them only once instead of on every `isUrlBlocked` call.
 */
function getAllowedKeywordPatterns(): RegExp[] {
  if (allowedKeywordPatterns === undefined) {
    allowedKeywordPatterns = allowedKeywords.map(
      // Keywords are escaped so that e.g. the dots in "://ads.tiktok.com"
      // match literal dots rather than any character.
      (keyword) => new RegExp(`\\b${escapeForRegExp(keyword)}\\b`, "i"),
    );
  }
  return allowedKeywordPatterns;
}

/**
 * Decides whether a URL points at a blocklisted site.
 *
 * The check runs in two steps:
 * 1. If the URL contains any entry of `allowedKeywords` as a whole word, it is
 *    never blocked. The keyword is searched in the entire URL (path and query
 *    included), not only in the hostname.
 * 2. Otherwise the URL is parsed and its hostname is compared with
 *    `socialMediaBlocklist`. A domain matches when it appears in the hostname
 *    as a run of complete dot-separated labels, so `www.facebook.com` and
 *    `facebook.com.br` match `facebook.com`, while `netflix.com` does not
 *    match `x.com`.
 *
 * @param url - The URL to check. Surrounding whitespace is ignored, and a
 *   missing `http://` / `https://` prefix (compared case-insensitively) is
 *   replaced by `https://` before parsing.
 * @returns `true` if the hostname matches a blocklisted domain; `false` if the
 *   URL is allow-listed, not blocklisted, or cannot be parsed (the parse
 *   failure is logged).
 */
export function isUrlBlocked(url: string): boolean {
  // Whitespace around the URL would otherwise end up inside the hostname once
  // a scheme is prepended, making parsing fail and the URL count as allowed.
  const trimmedUrl = url.trim();
  const lowerCaseUrl = trimmedUrl.toLowerCase();

  // Check if the URL contains any allowed keywords as whole words
  if (getAllowedKeywordPatterns().some((pattern) => pattern.test(lowerCaseUrl))) {
    return false;
  }

  // The scheme is detected on the lower-cased URL: an upper-case "HTTPS://"
  // must not get a second scheme prepended, which would turn "https" into the
  // parsed hostname and bypass the blocklist.
  const hasHttpScheme =
    lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://");
  const candidateUrl = hasHttpScheme ? trimmedUrl : "https://" + trimmedUrl;

  try {
    const hostname = new URL(candidateUrl).hostname.toLowerCase();

    // Check if the URL matches any domain in the blocklist. Wrapping both
    // sides in dots makes a plain substring test equivalent to the pattern
    // (^|\.)domain(\.|$) without building a regex per domain.
    const dottedHostname = `.${hostname}.`;
    return socialMediaBlocklist.some((domain) =>
      dottedHostname.includes(`.${domain.toLowerCase()}.`),
    );
  } catch (e) {
    // If an error occurs (e.g., invalid URL), return false
    logger.error(`Error parsing the following URL: ${candidateUrl}`);
    return false;
  }
}