// Source: firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts

/**
 * Registrable domains that `isUrlBlocked` treats as blocked.
 *
 * Entries are bare hostnames (no scheme, no path); a URL is matched against
 * them by its hostname, including any subdomain of an entry.
 */
const socialMediaBlocklist: readonly string[] = [
  'facebook.com',
  'x.com',
  'twitter.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
  'snapchat.com',
  'tiktok.com',
  'reddit.com',
  'tumblr.com',
  'flickr.com',
  'whatsapp.com',
  'wechat.com',
  'telegram.org',
];
/**
 * Substrings that exempt a URL from the blocklist.
 *
 * `isUrlBlocked` returns `false` for any URL whose text contains one of these
 * keywords, regardless of the URL's domain.
 */
const allowedKeywords: readonly string[] = [
  'pulse',
  'privacy',
  'terms',
  'policy',
  'user-agreement',
  'legal',
  'help',
  'support',
  'contact',
  'about',
  'careers',
  'blog',
  'press',
  'conditions',
];
/**
 * Decides whether a URL should be rejected because it is hosted on a
 * blocklisted social-media domain.
 *
 * A URL whose text contains any entry of `allowedKeywords` is never blocked;
 * that check is a plain substring match over the whole URL string, not just
 * its path. Otherwise the URL is blocked when its hostname is exactly a
 * `socialMediaBlocklist` entry or a subdomain of one.
 *
 * @param url - The URL to check; must be absolute to be parsed.
 * @returns `true` when the URL is blocked; `false` when it is allowed or when
 *   `url` cannot be parsed as a URL.
 */
export function isUrlBlocked(url: string): boolean {
  // Allowed keywords take precedence over the domain blocklist.
  if (allowedKeywords.some(keyword => url.includes(keyword))) {
    return false;
  }

  // Parse once; an unparsable URL has no hostname to match, so it is not blocked.
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }

  // Exact match or a dot-delimited suffix match, so `max.com` does not match
  // `x.com` and every dot in a blocklist entry is compared literally.
  return socialMediaBlocklist.some(
    domain => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}