// firecrawl/apps/api/src/scraper/WebScraper/utils/blocklist.ts

import { logger } from "../../../lib/logger";

/**
 * Domains that `isUrlBlocked` reports as blocked. A hostname matches when it
 * is one of these domains or a subdomain of one (e.g. `www.facebook.com`).
 */
const socialMediaBlocklist = [
  "facebook.com",
  "x.com",
  "twitter.com",
  "instagram.com",
  "linkedin.com",
  "snapchat.com",
  "tiktok.com",
  "reddit.com",
  "tumblr.com",
  "flickr.com",
  "whatsapp.com",
  "wechat.com",
  "telegram.org",
  "researchhub.com",
  "youtube.com",
  "corterix.com",
  "southwest.com",
  "ryanair.com",
];

/**
 * Keywords that exempt a URL from the blocklist. Each entry is matched
 * literally, delimited by word boundaries, against the whole lower-cased URL
 * (scheme, host, path and query), so a match anywhere in the URL allows it.
 */
const allowedKeywords = [
  "pulse",
  "privacy",
  "terms",
  "policy",
  "user-agreement",
  "legal",
  "help",
  "policies",
  "support",
  "contact",
  "about",
  "careers",
  "blog",
  "press",
  "conditions",
  "tos",
  "://library.tiktok.com",
  "://ads.tiktok.com",
  "://tiktok.com/business",
  "://developers.facebook.com",
];

/** Escapes every regular-expression metacharacter so `text` matches literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

// Compiled once at module load instead of on every call.
const allowedKeywordPatterns = allowedKeywords.map(
  (keyword) => new RegExp(`\\b${escapeRegExp(keyword)}\\b`, "i"),
);

// `(^|\.)` anchors the domain at a label boundary so "notfacebook.com" does
// not match "facebook.com". The `(\.|$)` suffix also accepts the domain being
// followed by further labels, e.g. a trailing-dot hostname "facebook.com.".
const blockedDomainPatterns = socialMediaBlocklist.map(
  (domain) => new RegExp(`(^|\\.)${escapeRegExp(domain)}(\\.|$)`, "i"),
);

/**
 * Decides whether a URL belongs to a blocked domain.
 *
 * The allow-list takes precedence: if any allowed keyword occurs in the URL,
 * the URL is never blocked. Otherwise the hostname is compared against the
 * blocklist. Input without an `http://` / `https://` scheme (checked
 * case-insensitively) is treated as `https://<input>`; surrounding whitespace
 * is ignored.
 *
 * @param url - Absolute URL, or a bare host/path such as `twitter.com/foo`.
 * @returns `true` when the hostname is a blocked domain or a subdomain of one
 *   and no allowed keyword is present; `false` otherwise, including when the
 *   URL cannot be parsed (the failure is logged).
 */
export function isUrlBlocked(url: string): boolean {
  const lowerCaseUrl = url.toLowerCase();

  // Allowed keywords win over the blocklist.
  if (allowedKeywordPatterns.some((pattern) => pattern.test(lowerCaseUrl))) {
    return false;
  }

  // Trim first: stray whitespace would otherwise end up inside the host once
  // the scheme is prepended and make the URL unparsable (=> "not blocked").
  const trimmedUrl = url.trim();

  // The scheme is case-insensitive; a case-sensitive check would turn
  // "HTTP://facebook.com" into "https://HTTP://facebook.com", whose hostname
  // parses as "http" and slips past the blocklist.
  const candidate = /^https?:\/\//i.test(trimmedUrl)
    ? trimmedUrl
    : "https://" + trimmedUrl;

  try {
    const hostname = new URL(candidate).hostname.toLowerCase();
    return blockedDomainPatterns.some((pattern) => pattern.test(hostname));
  } catch {
    // Unparsable input is reported and treated as not blocked.
    logger.error(`Error parsing the following URL: ${candidate}`);
    return false;
  }
}