Files
firecrawl/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
T

54 lines
1.5 KiB
TypeScript
Raw Normal View History

2024-06-04 17:47:28 -03:00
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
2024-06-04 12:15:39 -07:00
/**
 * Inspects a page's raw HTML for known special cases and, when one matches,
 * returns a descriptor telling the caller which scraper to use instead of the
 * default one. This function only decides; it performs no scraping itself.
 *
 * Checks run in this order and the first match wins:
 *  1. Readme Docs pages (`<meta name="readme-deploy"` present): "fire-engine"
 *     with a 1000 ms wait and an XPath to scroll.
 *  2. Vanta security portals (stylesheet served from static.vanta.com):
 *     "fire-engine" with a 3000 ms wait.
 *  3. Any Google Drive `/file/d/<id>/view` link in the HTML: "pdf" scraper
 *     pointed at the direct-download URL for that file id.
 *
 * @param text Raw HTML (or text) of the page that was fetched.
 * @param url  URL the HTML was fetched from; used for logging and echoed back
 *             in the fire-engine descriptors.
 * @returns A scraper descriptor for the first matching special case, or
 *          `null` when none applies.
 */
export async function handleCustomScraping(
  text: string,
  url: string
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return {
      scraper: "fire-engine",
      url: url,
      waitAfterLoad: 1000,
      pageOptions: {
        scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
      }
    };
  }

  // Check for Vanta security portals
  if (text.includes('<link href="https://static.vanta.com')) {
    console.log(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
    );
    return {
      scraper: "fire-engine",
      url: url,
      waitAfterLoad: 3000,
    };
  }

  // Check for Google Drive PDF links in the raw HTML
  const googleDrivePdfPattern =
    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
  const googleDrivePdfLink = text.match(googleDrivePdfPattern);
  if (googleDrivePdfLink) {
    console.log(
      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
    );

    // The id is lifted from untrusted HTML and the capture group accepts any
    // non-slash character, so escape it before placing it in the query string.
    // Otherwise characters such as `&`, `=` or `#` would alter the download
    // URL's parameters. Ordinary Drive ids pass through unchanged.
    const fileId = encodeURIComponent(googleDrivePdfLink[1]);
    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

    return {
      scraper: "pdf",
      url: pdfUrl
    };
  }

  return null;
}