Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
T

204 lines
5.5 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
2024-12-17 12:12:22 -05:00
import PdfParse from "pdf-parse";
2024-11-07 20:57:33 +01:00
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
2024-12-27 16:37:32 +01:00
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
import { stat, readFile, unlink } from "node:fs/promises";
import path from "node:path";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Output produced by a PDF processing backend in this module: an HTML
 * rendering is always present, a Markdown rendering is optional.
 */
type PDFProcessorResult = { html: string; markdown?: string };
2024-12-27 16:37:32 +01:00
/** Largest PDF, in bytes, that is forwarded to MinerU (10 MiB). */
const MINERU_MAX_FILE_SIZE = 10 * 1024 * 1024;
/** Polling budget in milliseconds used when the caller passes no `timeToRun`. */
const MINERU_DEFAULT_TIMEOUT_MS = 300000;
/** Pause in milliseconds between two consecutive status polls. */
const MINERU_POLL_INTERVAL_MS = 250;

/**
 * Converts a downloaded PDF to Markdown/HTML through the MinerU worker that is
 * reachable via the RunPod HTTP API (`MINERU_POD_ID` / `MINERU_API_KEY`).
 *
 * The file is uploaded base64-encoded to the `/run` endpoint, after which the
 * job's `/status/{id}` endpoint is polled until it reports `COMPLETED` or
 * `FAILED`, or until the polling budget is used up.
 *
 * @param meta - Scrape context; only its logger is used here.
 * @param tempFilePath - Path of the PDF on local disk.
 * @param timeToRun - Polling budget in milliseconds; defaults to 300000 when
 *   `undefined`. The budget starts after the upload request has returned.
 * @returns The Markdown reported by MinerU and its HTML rendering via `marked`.
 * @throws UnsupportedFileError when the file is larger than 10 MiB.
 * @throws Error with message `"MinerU timed out"` when the budget elapses
 *   before the job reaches a terminal status.
 * @throws Error when the job fails, when a status request is answered with a
 *   failure status, or when a completed job carries no output.
 */
async function scrapePDFWithMinerU(
  meta: Meta,
  tempFilePath: string,
  timeToRun: number | undefined,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with MinerU", {
    tempFilePath,
  });

  // Check the size before reading so oversized files are never loaded into memory.
  const fileStat = await stat(tempFilePath);
  if (fileStat.size > MINERU_MAX_FILE_SIZE) {
    throw new UnsupportedFileError(
      "File is larger than PDF parser limit (10MiB)",
    );
  }

  const endpoint = `https://api.runpod.ai/v2/${process.env.MINERU_POD_ID}`;
  const authorization = `Bearer ${process.env.MINERU_API_KEY}`;

  const upload = await robustFetch({
    url: `${endpoint}/run`,
    method: "POST",
    headers: {
      Authorization: authorization,
    },
    body: {
      input: {
        file_content: (await readFile(tempFilePath)).toString("base64"),
        filename: path.basename(tempFilePath) + ".pdf",
      },
    },
    logger: meta.logger.child({
      method: "scrapePDFWithMinerU/upload/robustFetch",
    }),
    schema: z.object({
      id: z.string(),
    }),
  });
  const jobId = upload.id;

  // TODO: retries
  const startedAt = Date.now();
  const timeout = timeToRun ?? MINERU_DEFAULT_TIMEOUT_MS;

  while (Date.now() <= startedAt + timeout) {
    try {
      const result = await robustFetch({
        url: `${endpoint}/status/${jobId}`,
        method: "GET",
        headers: {
          Authorization: authorization,
        },
        logger: meta.logger.child({
          method: "scrapePDFWithMinerU/result/robustFetch",
        }),
        schema: z.object({
          status: z.string(),
          error: z.any().optional(),
          output: z
            .object({
              markdown: z.string(),
            })
            .optional(),
        }),
      });

      if (result.status === "COMPLETED") {
        // The schema allows `output` to be absent, so fail with a clear
        // message instead of dereferencing `undefined`.
        if (result.output === undefined) {
          throw new Error("MinerU reported completion without any output");
        }
        const markdown = result.output.markdown;
        return {
          markdown,
          html: await marked.parse(markdown, { async: true }),
        };
      }

      if (result.status === "FAILED") {
        // `error` is untyped; serialise non-strings so the message does not
        // degrade to "[object Object]".
        const detail =
          typeof result.error === "string"
            ? result.error
            : JSON.stringify(result.error);
        throw new Error("MinerU failed to parse PDF: " + detail, {
          cause: result.error,
        });
      }

      // Any other status: result not up yet, keep polling.
    } catch (e) {
      if (e instanceof Error && e.message === "Request sent failure status") {
        // Failure responses from the status endpoint are surfaced under a
        // MinerU-specific message, keeping the original cause attached.
        throw new Error("MinerU threw an error", {
          cause: e.cause,
        });
      }
      throw e;
    }

    await new Promise<void>((resolve) =>
      setTimeout(() => resolve(), MINERU_POLL_INTERVAL_MS),
    );
  }

  throw new Error("MinerU timed out");
}
2024-12-11 19:46:11 -03:00
/**
 * Extracts the plain text of a PDF on local disk with `pdf-parse`.
 *
 * The extracted text is HTML-escaped once and that same string is returned as
 * both the Markdown and the HTML representation.
 *
 * @param meta - Scrape context; only its logger is used here.
 * @param tempFilePath - Path of the PDF on local disk.
 * @returns The escaped text under both `markdown` and `html`.
 */
async function scrapePDFWithParsePDF(
  meta: Meta,
  tempFilePath: string,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);
  const safeText = escapeHtml(text);

  return { markdown: safeText, html: safeText };
}
2024-12-17 16:58:57 -03:00
/**
 * Scrape engine entry point for PDF URLs.
 *
 * When `meta.options.parsePDF` is falsy the file is fetched and its raw bytes
 * are returned base64-encoded as both `html` and `markdown`, without parsing.
 *
 * Otherwise the file is downloaded to a temporary path and parsed with
 * `scrapePDFWithParsePDF`. If that yields fewer than 500 characters (including
 * none at all) and both `MINERU_API_KEY` and `MINERU_POD_ID` are set, the file
 * is additionally sent to `scrapePDFWithMinerU`; when MinerU succeeds its
 * result replaces the first one, when it fails or times out the first result
 * is kept. The temporary file is removed on every exit path.
 *
 * @param meta - Scrape context (id, url, options, logger).
 * @param timeToRun - Time budget in milliseconds handed to the MinerU step.
 * @returns Final URL and status code of the download plus the extracted
 *   HTML/Markdown.
 * @throws RemoveFeatureError when the MinerU step raises it; it is passed on
 *   unchanged.
 */
export async function scrapePDF(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,

      html: content,
      markdown: content,
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  try {
    // First, try parsing with PdfParse
    let result: PDFProcessorResult = await scrapePDFWithParsePDF(
      {
        ...meta,
        logger: meta.logger.child({
          method: "scrapePDF/scrapePDFWithParsePDF",
        }),
      },
      tempFilePath,
    );

    // Then, if output is too short, pass to MinerU. Missing or empty text
    // counts as length 0 so that it also qualifies for the fallback.
    const extractedLength = result.markdown?.length ?? 0;
    if (
      extractedLength < 500 &&
      process.env.MINERU_API_KEY &&
      process.env.MINERU_POD_ID
    ) {
      try {
        // Use MinerU result if successful
        result = await scrapePDFWithMinerU(
          {
            ...meta,
            logger: meta.logger.child({
              method: "scrapePDF/scrapePDFWithMinerU",
            }),
          },
          tempFilePath,
          timeToRun,
        );
      } catch (error) {
        if (error instanceof Error && error.message === "MinerU timed out") {
          meta.logger.warn("MinerU timed out -- using parse-pdf result", {
            error,
          });
        } else if (error instanceof RemoveFeatureError) {
          throw error;
        } else {
          meta.logger.warn(
            "MinerU failed to parse PDF -- using parse-pdf result",
            { error },
          );
          Sentry.captureException(error);
        }
      }
    }

    return {
      url: response.url,
      statusCode: response.status,

      html: result.html,
      markdown: result.markdown,
    };
  } finally {
    // Clean up on success and on failure alike. A cleanup problem is only
    // logged so it can neither mask the original error nor discard a result.
    try {
      await unlink(tempFilePath);
    } catch (error) {
      meta.logger.warn("Failed to remove temporary PDF file", {
        tempFilePath,
        error,
      });
    }
  }
}