Files
firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts
T

69 lines
1.5 KiB
TypeScript
Raw Normal View History

2024-07-25 09:48:06 -03:00
import { Logger } from "../../../lib/logger";
/**
 * Domains that `isUrlBlocked` refuses to scrape.
 *
 * Entries are bare, lower-case registrable domains; the matcher compares them
 * against the parsed hostname, so subdomains of an entry are covered as well.
 */
const socialMediaBlocklist: readonly string[] = [
  'facebook.com',
  'x.com',
  'twitter.com',
  'instagram.com',
  'linkedin.com',
  'snapchat.com',
  'tiktok.com',
  'reddit.com',
  'tumblr.com',
  'flickr.com',
  'whatsapp.com',
  'wechat.com',
  'telegram.org',
  'researchhub.com',
  'youtube.com',
];
2024-05-20 17:24:21 -07:00
/**
 * Keywords that exempt a URL from the blocklist.
 *
 * `isUrlBlocked` tests each keyword as a whole word against the entire
 * lower-cased URL and, on any match, reports the URL as not blocked before
 * the hostname is even inspected.
 */
const allowedKeywords: readonly string[] = [
  'pulse',
  'privacy',
  'terms',
  'policy',
  'user-agreement',
  'legal',
  'help',
  'policies',
  'support',
  'contact',
  'about',
  'careers',
  'blog',
  'press',
  'conditions',
  'tos',
];
/** Escapes every regular-expression metacharacter in `text` so it matches literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Both pattern lists are compiled once at module load rather than on every call.
// None of them use the `g` flag, so `test()` carries no state between calls.
const allowedKeywordPatterns = allowedKeywords.map(
  keyword => new RegExp(`\\b${escapeRegExp(keyword)}\\b`, 'i')
);
const blockedDomainPatterns = socialMediaBlocklist.map(
  // The domain must appear as a complete run of dot-delimited labels in the hostname.
  domain => new RegExp(`(^|\\.)${escapeRegExp(domain)}(\\.|$)`, 'i')
);

/**
 * Decides whether a URL points at a blocklisted domain.
 *
 * @param url - Absolute URL, or a bare host/path; `https://` is prepended when
 *   no `http://` / `https://` scheme is present (checked case-insensitively).
 * @returns `true` when the parsed hostname matches an entry of
 *   `socialMediaBlocklist`. Returns `false` when the URL contains any of
 *   `allowedKeywords` as a whole word, when no blocklist entry matches, or when
 *   the URL cannot be parsed (the failure is logged, not thrown).
 */
export function isUrlBlocked(url: string): boolean {
  const lowerCaseUrl = url.toLowerCase();

  // Allowed keywords win over the blocklist and are matched against the whole URL.
  if (allowedKeywordPatterns.some(pattern => pattern.test(lowerCaseUrl))) {
    return false;
  }

  // Detect the scheme on the lower-cased copy: a case-sensitive check would
  // prepend a second scheme to e.g. "HTTPS://facebook.com", which then parses
  // with hostname "https" and slips past the blocklist.
  const hasHttpScheme =
    lowerCaseUrl.startsWith('http://') || lowerCaseUrl.startsWith('https://');
  const normalizedUrl = hasHttpScheme ? url : 'https://' + url;

  try {
    const hostname = new URL(normalizedUrl).hostname.toLowerCase();
    return blockedDomainPatterns.some(pattern => pattern.test(hostname));
  } catch {
    // An unparsable URL is treated as not blocked.
    Logger.error(`Error parsing the following URL: ${normalizedUrl}`);
    return false;
  }
}