apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts

import { fetchAndProcessPdf } from "../utils/pdfProcessor";

/**
 * Inspects raw page HTML for known special cases that need a non-default
 * scraper configuration.
 *
 * Checks run in order and the first match wins:
 * 1. Readme Docs pages (`<meta name="readme-deploy"` marker): "fire-engine"
 *    with a 1000 ms wait and a scroll XPath.
 * 2. Vanta security portals (`<link href="https://static.vanta.com` marker):
 *    "fire-engine" with a 3000 ms wait.
 * 3. A Google Drive `/file/d/<id>/view` link anywhere in the HTML: the "pdf"
 *    scraper, pointed at the corresponding `uc?export=download` URL.
 *
 * @param text - Raw HTML of the page; treated as untrusted input.
 * @param url - URL the HTML belongs to; echoed back in the fire-engine cases
 *   and used in log messages.
 * @returns The scraper override to apply, or `null` when no special case
 *   matches.
 */
export async function handleCustomScraping(
  text: string,
  url: string
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return {
      scraper: "fire-engine",
      url: url,
      waitAfterLoad: 1000,
      pageOptions: {
        scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
      }
    };
  }

  // Check for Vanta security portals
  if (text.includes('<link href="https://static.vanta.com')) {
    console.log(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
    );
    return {
      scraper: "fire-engine",
      url: url,
      waitAfterLoad: 3000,
    };
  }

  // Check for Google Drive PDF links in the raw HTML
  const googleDrivePdfPattern =
    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
  const googleDrivePdfLink = text.match(googleDrivePdfPattern);
  if (googleDrivePdfLink) {
    console.log(
      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
    );

    // The capture group accepts any non-slash characters, so the ID may carry
    // query metacharacters (`&`, `#`, `=`, quotes, whitespace) taken straight
    // from untrusted HTML. Percent-encode it so it can only ever be the value
    // of the `id` parameter. Well-formed IDs pass through unchanged.
    const fileId = encodeURIComponent(googleDrivePdfLink[1]);
    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

    return {
      scraper: "pdf",
      url: pdfUrl
    };
  }

  return null;
}
[feat] improved the scrape for gdrive pdfs 2024-06-04 17:47:28 -03:00			`import { fetchAndProcessPdf } from "../utils/pdfProcessor";`

Nick: 2024-06-04 12:15:39 -07:00			`export async function handleCustomScraping(`
			`text: string,`
			`url: string`
Nick: 2024-06-05 10:13:52 -07:00			`): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } \| null> {`
Nick: 2024-06-04 12:15:39 -07:00			`// Check for Readme Docs special case`
			`if (text.includes('<meta name="readme-deploy"')) {`
			`console.log(`
			`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
			`);`
			`return {`
			`scraper: "fire-engine",`
			`url: url,`
Added scroll xpaths on fire-engine for handling readme docs 2024-06-05 11:48:41 -03:00			`waitAfterLoad: 1000,`
			`pageOptions: {`
			`scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']`
			`}`
Nick: 2024-06-04 12:15:39 -07:00			`};`
			`}`

Update handleCustomScraping.ts 2024-06-04 12:22:46 -07:00			`// Check for Vanta security portals`
Nick: 2024-06-04 12:15:39 -07:00			`if (text.includes('<link href="https://static.vanta.com')) {`
			`console.log(`
			`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
			`);`
			`return {`
			`scraper: "fire-engine",`
			`url: url,`
Added scroll xpaths on fire-engine for handling readme docs 2024-06-05 11:48:41 -03:00			`waitAfterLoad: 3000,`
Nick: 2024-06-04 12:15:39 -07:00			`};`
			`}`

			`// Check for Google Drive PDF links in the raw HTML`
			`const googleDrivePdfPattern =`
[feat] improved the scrape for gdrive pdfs 2024-06-04 17:47:28 -03:00			`/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;`
Nick: 2024-06-04 12:15:39 -07:00			`const googleDrivePdfLink = text.match(googleDrivePdfPattern);`
			`if (googleDrivePdfLink) {`
			`console.log(`
			`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
			`);`
[feat] improved the scrape for gdrive pdfs 2024-06-04 17:47:28 -03:00
			`const fileId = googleDrivePdfLink[1];`
			const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

Nick: 2024-06-04 12:15:39 -07:00			`return {`
[feat] improved the scrape for gdrive pdfs 2024-06-04 17:47:28 -03:00			`scraper: "pdf",`
			`url: pdfUrl`
Nick: 2024-06-04 12:15:39 -07:00			`};`
			`}`
[feat] improved the scrape for gdrive pdfs 2024-06-04 17:47:28 -03:00
Nick: 2024-06-04 12:15:39 -07:00			`return null;`
			`}`