// apps/api/src/scraper/WebScraper/utils/blocklist.ts

import { Logger } from "../../../lib/logger";

// Domains whose pages are refused by isUrlBlocked. Entries are bare
// registrable domains (no scheme, no "www." prefix); they are compared
// against the parsed hostname of the candidate URL.
const socialMediaBlocklist = [
  "facebook.com",
  "x.com",
  "twitter.com",
  "instagram.com",
  "linkedin.com",
  "snapchat.com",
  "tiktok.com",
  "reddit.com",
  "tumblr.com",
  "flickr.com",
  "whatsapp.com",
  "wechat.com",
  "telegram.org",
  "researchhub.com",
  "youtube.com",
];

// Keywords that exempt a URL from the blocklist. isUrlBlocked looks for each
// one as a whole word anywhere in the lower-cased URL and, on a match,
// reports the URL as not blocked without consulting the domain list.
const allowedKeywords = [
  "pulse",
  "privacy",
  "terms",
  "policy",
  "user-agreement",
  "legal",
  "help",
  "policies",
  "support",
  "contact",
  "about",
  "careers",
  "blog",
  "press",
  "conditions",
  "tos",
];

/**
 * Reports whether a URL points at a blocklisted domain and should be skipped.
 *
 * Evaluation order:
 * 1. If the URL (host, path or query) contains any entry of `allowedKeywords`
 *    as a whole word, it is never blocked.
 * 2. Otherwise the hostname is parsed and compared against
 *    `socialMediaBlocklist`. An entry matches when it appears as a run of
 *    complete dot-separated labels anywhere in the hostname, so `facebook.com`
 *    blocks `www.facebook.com` and `facebook.com.br` but not `myfacebook.com`.
 *
 * @param url - Absolute URL or bare host/path. `https://` is assumed when no
 *   http(s) scheme is present. Surrounding whitespace and the letter case of
 *   the scheme are ignored.
 * @returns `true` when the hostname matches a blocklisted domain and no
 *   allowed keyword is present; `false` otherwise, including when the URL
 *   cannot be parsed (the failure is logged through `Logger.error`).
 */
export function isUrlBlocked(url: string): boolean {
  const trimmedUrl = url.trim();
  const lowerCaseUrl = trimmedUrl.toLowerCase();

  // Keywords are interpolated into the pattern as-is, so entries in
  // allowedKeywords must not contain regex metacharacters.
  const hasAllowedKeyword = allowedKeywords.some(keyword =>
    new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl)
  );
  if (hasAllowedKeyword) {
    return false;
  }

  // Detect the scheme on the lower-cased copy: prefixing "HTTPS://host" with
  // another "https://" would make the parser treat "https" as the hostname.
  const hasHttpScheme =
    lowerCaseUrl.startsWith('http://') || lowerCaseUrl.startsWith('https://');
  const normalizedUrl = hasHttpScheme ? trimmedUrl : 'https://' + trimmedUrl;

  try {
    const hostname = new URL(normalizedUrl).hostname.toLowerCase();

    // Wrapping both sides in dots turns "starts at a label boundary and ends
    // at a label boundary" into a plain substring test, with no regex escaping.
    const dottedHostname = `.${hostname}.`;
    return socialMediaBlocklist.some(domain =>
      dottedHostname.includes(`.${domain.toLowerCase()}.`)
    );
  } catch (e) {
    // If an error occurs (e.g., invalid URL), treat the URL as not blocked
    Logger.error(`Error parsing the following URL: ${normalizedUrl}`);
    return false;
  }
}
updated logs 2024-07-25 09:48:06 -03:00			`import { Logger } from "../../../lib/logger";`

[Feat] Added blocklist for social media urls 2024-04-23 18:50:35 -03:00			`const socialMediaBlocklist = [`
			`'facebook.com',`
Update blocklist.ts 2024-05-24 15:04:15 -07:00			`'x.com',`
[Feat] Added blocklist for social media urls 2024-04-23 18:50:35 -03:00			`'twitter.com',`
			`'instagram.com',`
			`'linkedin.com',`
			`'snapchat.com',`
			`'tiktok.com',`
			`'reddit.com',`
			`'tumblr.com',`
			`'flickr.com',`
			`'whatsapp.com',`
			`'wechat.com',`
			`'telegram.org',`
Update blocklist.ts 2024-08-19 08:51:48 -03:00			`'researchhub.com',`
			`'youtube.com'`
[Feat] Added blocklist for social media urls 2024-04-23 18:50:35 -03:00			`];`

Nick: allowed keywords for now 2024-05-20 17:24:21 -07:00			`const allowedKeywords = [`
			`'pulse',`
			`'privacy',`
			`'terms',`
			`'policy',`
			`'user-agreement',`
			`'legal',`
Update blocklist.ts 2024-05-20 17:26:01 -07:00			`'help',`
transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`'policies',`
Update blocklist.ts 2024-05-20 17:26:01 -07:00			`'support',`
			`'contact',`
			`'about',`
			`'careers',`
			`'blog',`
			`'press',`
			`'conditions',`
transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`'tos'`
[Feat] Added allowed urls 2024-04-25 08:39:45 -03:00			`];`

[Feat] Added blocklist for social media urls 2024-04-23 18:50:35 -03:00			`export function isUrlBlocked(url: string): boolean {`
transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`const lowerCaseUrl = url.toLowerCase();`

			`// Check if the URL contains any allowed keywords as whole words`
			if (allowedKeywords.some(keyword => new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl))) {
[Feat] Added allowed urls 2024-04-25 08:39:45 -03:00			`return false;`
			`}`

Nick: 2024-06-03 16:42:42 -07:00			`try {`
testing crawl with new.abb.com case 2024-06-24 16:25:07 -03:00			`if (!url.startsWith('http://') && !url.startsWith('https://')) {`
			`url = 'https://' + url;`
			`}`

transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`const urlObj = new URL(url);`
			`const hostname = urlObj.hostname.toLowerCase();`

Nick: 2024-06-03 16:42:42 -07:00			`// Check if the URL matches any domain in the blocklist`
transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`const isBlocked = socialMediaBlocklist.some(domain => {`
			const domainPattern = new RegExp(`(^\|\\.)${domain.replace('.', '\\.')}(\\.\|$)`, 'i');
			`return domainPattern.test(hostname);`
Nick: 2024-06-03 16:42:42 -07:00			`});`
transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00
			`return isBlocked;`
Nick: 2024-06-03 16:42:42 -07:00			`} catch (e) {`
			`// If an error occurs (e.g., invalid URL), return false`
updated logs 2024-07-25 09:48:06 -03:00			Logger.error(`Error parsing the following URL: ${url}`);
Nick: 2024-06-03 16:42:42 -07:00			`return false;`
			`}`
[Feat] Added blocklist for social media urls 2024-04-23 18:50:35 -03:00			`}`