Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
T

179 lines
4.8 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
2024-12-17 12:12:22 -05:00
import PdfParse from "pdf-parse";
2024-11-07 20:57:33 +01:00
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
2024-12-27 20:59:18 -03:00
import { readFile, unlink } from "node:fs/promises";
2024-12-27 16:37:32 +01:00
import path from "node:path";
import type { Response } from "undici";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Output shared by the PDF processors in this module: an HTML rendering of
 * the document and, optionally, a markdown rendering.
 */
type PDFProcessorResult = {
  html: string;
  markdown?: string;
};
2024-12-27 20:54:26 -03:00
/**
 * Upper bound (19 MiB, expressed as a plain number) used to decide whether a
 * PDF is sent to RunPod MU. Note that `scrapePDF` compares this against the
 * length of the base64-encoded content, not the raw byte size of the file.
 */
const MAX_FILE_SIZE = 19 * 1024 ** 2;
2024-12-27 19:59:05 -03:00
/**
 * Sends a base64-encoded PDF to the RunPod MU `runsync` endpoint and returns
 * the markdown from the response together with an HTML rendering of it.
 *
 * The endpoint id and bearer token are read from the `RUNPOD_MU_POD_ID` and
 * `RUNPOD_MU_API_KEY` environment variables.
 *
 * @param meta - Scrape context; supplies the logger and the `mock` value
 *   forwarded to `robustFetch`.
 * @param tempFilePath - Path of the local PDF. Only its basename is used, to
 *   build the `filename` field of the request.
 * @param timeToRun - Not used by this function.
 * @param base64Content - The PDF contents, base64-encoded.
 * @returns The markdown reported by the service and the HTML produced by
 *   running that markdown through `marked`.
 */
async function scrapePDFWithRunPodMU(
  meta: Meta,
  tempFilePath: string,
  timeToRun: number | undefined,
  base64Content: string,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with RunPod MU", {
    tempFilePath,
  });

  const endpoint = `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/runsync`;
  const requestHeaders = {
    Authorization: "Bearer " + process.env.RUNPOD_MU_API_KEY,
  };
  const requestBody = {
    input: {
      file_content: base64Content,
      filename: path.basename(tempFilePath) + ".pdf",
    },
  };
  const fetchLogger = meta.logger.child({
    method: "scrapePDFWithRunPodMU/robustFetch",
  });
  // Expected response shape: { output: { markdown: string } }
  const responseSchema = z.object({
    output: z.object({
      markdown: z.string(),
    }),
  });

  const response = await robustFetch({
    url: endpoint,
    method: "POST",
    headers: requestHeaders,
    body: requestBody,
    logger: fetchLogger,
    schema: responseSchema,
    mock: meta.mock,
  });

  const markdown = response.output.markdown;
  const html = await marked.parse(markdown, { async: true });
  return { markdown, html };
}
2024-12-11 19:46:11 -03:00
/**
 * Extracts the text of a local PDF with `pdf-parse` and returns it
 * HTML-escaped.
 *
 * @param meta - Scrape context; only its logger is used here.
 * @param tempFilePath - Path of the PDF file to read from disk.
 * @returns The same escaped text as both `markdown` and `html`.
 */
async function scrapePDFWithParsePDF(
  meta: Meta,
  tempFilePath: string,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);

  // The extracted text is escaped once and reused for both output fields.
  const safeText = escapeHtml(text);
  return { markdown: safeText, html: safeText };
}
2024-12-17 16:58:57 -03:00
/**
 * Scrape engine entry point for PDF documents.
 *
 * When `meta.options.parsePDF` is falsy the PDF is not parsed: its bytes are
 * returned base64-encoded in both `html` and `markdown`, read from
 * `meta.pdfPrefetch.filePath` if a prefetch is present or fetched into a
 * buffer otherwise.
 *
 * When parsing is enabled the PDF is taken from the prefetch or downloaded to
 * a temporary file, then processed by RunPod MU (only if the base64 payload is
 * below `MAX_FILE_SIZE` and both `RUNPOD_MU_API_KEY` and `RUNPOD_MU_POD_ID`
 * are set). If RunPod MU is skipped or fails, `pdf-parse` is used instead.
 *
 * Temporary-file handling: on success the file at `tempFilePath` is removed
 * (prefetched or downloaded). On failure, a file that this function
 * downloaded itself is removed on a best-effort basis so it does not leak;
 * a prefetched file is left untouched because this function did not create it.
 *
 * @param meta - Scrape context (URL, options, logger, optional PDF prefetch).
 * @param timeToRun - Forwarded to `scrapePDFWithRunPodMU`.
 * @returns The final URL, status code, and the HTML/markdown renderings.
 * @throws PDFAntibotError when a freshly fetched response carries a
 *   Content-Type header that does not mention `application/pdf`.
 * @throws RemoveFeatureError when raised by the RunPod MU step (rethrown
 *   instead of falling back).
 */
export async function scrapePDF(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  // Media types are case-insensitive, so normalise before matching. A missing
  // or empty header is not treated as "non-PDF".
  const isNonPdfContentType = (contentType: string | null): boolean =>
    !!contentType && !contentType.toLowerCase().includes("application/pdf");

  if (!meta.options.parsePDF) {
    if (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null) {
      const content = (await readFile(meta.pdfPrefetch.filePath)).toString("base64");
      return {
        url: meta.pdfPrefetch.url ?? meta.url,
        statusCode: meta.pdfPrefetch.status,
        html: content,
        markdown: content,
      };
    } else {
      const file = await fetchFileToBuffer(meta.url, {
        headers: meta.options.headers,
      });

      // If the downloaded file wasn't a PDF, assume an anti-bot page.
      if (isNonPdfContentType(file.response.headers.get("Content-Type"))) {
        throw new PDFAntibotError();
      }

      const content = file.buffer.toString("base64");
      return {
        url: file.response.url,
        statusCode: file.response.status,
        html: content,
        markdown: content,
      };
    }
  }

  // True when the temp file is created by downloadFile below rather than
  // supplied by the prefetch; only then is it ours to delete on failure.
  const ownsTempFile = meta.pdfPrefetch === undefined || meta.pdfPrefetch === null;

  const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
    ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
    : await downloadFile(meta.id, meta.url, {
        headers: meta.options.headers,
      });

  let result: PDFProcessorResult | null = null;

  try {
    if ((response as any).headers) { // if downloadFile was used
      const r: Response = response as any;
      // If the downloaded file wasn't a PDF, assume an anti-bot page.
      if (isNonPdfContentType(r.headers.get("Content-Type"))) {
        throw new PDFAntibotError();
      }
    }

    const base64Content = (await readFile(tempFilePath)).toString("base64");

    // First try RunPod MU if conditions are met
    if (
      base64Content.length < MAX_FILE_SIZE &&
      process.env.RUNPOD_MU_API_KEY &&
      process.env.RUNPOD_MU_POD_ID
    ) {
      try {
        result = await scrapePDFWithRunPodMU(
          {
            ...meta,
            logger: meta.logger.child({
              method: "scrapePDF/scrapePDFWithRunPodMU",
            }),
          },
          tempFilePath,
          timeToRun,
          base64Content,
        );
      } catch (error) {
        // RemoveFeatureError must reach the caller; anything else triggers
        // the pdf-parse fallback below.
        if (error instanceof RemoveFeatureError) {
          throw error;
        }
        meta.logger.warn(
          "RunPod MU failed to parse PDF (could be due to timeout) -- falling back to parse-pdf",
          { error },
        );
        Sentry.captureException(error);
      }
    }

    // If RunPod MU failed or wasn't attempted, use PdfParse
    if (!result) {
      result = await scrapePDFWithParsePDF(
        {
          ...meta,
          logger: meta.logger.child({
            method: "scrapePDF/scrapePDFWithParsePDF",
          }),
        },
        tempFilePath,
      );
    }
  } catch (error) {
    if (ownsTempFile) {
      // Best-effort cleanup: never let a cleanup failure mask the real error.
      try {
        await unlink(tempFilePath);
      } catch (cleanupError) {
        meta.logger.warn("Failed to remove temporary PDF file after scrape error", {
          tempFilePath,
          error: cleanupError,
        });
      }
    }
    throw error;
  }

  await unlink(tempFilePath);

  return {
    url: response.url ?? meta.url,
    statusCode: response.status,
    html: result?.html ?? "",
    markdown: result?.markdown ?? "",
  };
}