Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
T

189 lines
5.1 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { createReadStream, promises as fs } from "node:fs";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError } from "../../error";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/** Output shape shared by the PDF processors in this file. */
type PDFProcessorResult = {
  /** HTML rendering of the document. */
  html: string;
  /** Markdown rendering of the document, when the processor produced one. */
  markdown?: string;
};
/**
 * Converts a PDF stored on disk to markdown/HTML through the LlamaParse
 * cloud API.
 *
 * The file is uploaded once, after which the job's markdown result endpoint
 * is polled every 250 ms until it answers or the time budget
 * (`meta.options.timeout`, falling back to 300000 ms) is used up.
 *
 * @param meta - Scrape context; supplies the logger and the timeout option.
 * @param tempFilePath - Path of the PDF file on local disk.
 * @returns The markdown returned by LlamaParse plus its rendering by `marked`.
 * @throws RemoveFeatureError if a failure response body contains
 *   "PDF_IS_BROKEN".
 * @throws Error "LlamaParse threw an error" on any other non-404 failure
 *   status, and "LlamaParse timed out" when the time budget runs out.
 */
async function scrapePDFWithLlamaParse(
  meta: Meta,
  tempFilePath: string
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with LlamaIndex", {
    tempFilePath
  });

  const uploadForm = new FormData();
  const { size: pdfSize } = await fs.stat(tempFilePath);

  // Hand-rolled stand-in for a Blob: only stream() is implemented, and it
  // reads straight from the file on disk. The remaining Blob methods throw.
  const pdfBlob = {
    [Symbol.toStringTag]: "Blob",
    name: tempFilePath,
    stream() {
      const fileStream = createReadStream(tempFilePath);
      return fileStream as unknown as ReadableStream<Uint8Array>;
    },
    arrayBuffer() {
      throw Error("Unimplemented in mock Blob: arrayBuffer");
    },
    size: pdfSize,
    text() {
      throw Error("Unimplemented in mock Blob: text");
    },
    slice() {
      throw Error("Unimplemented in mock Blob: slice");
    },
    type: "application/pdf"
  } as Blob;
  uploadForm.append("file", pdfBlob);

  const authorization = `Bearer ${process.env.LLAMAPARSE_API_KEY}`;

  const { id: jobId } = await robustFetch({
    url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
    method: "POST",
    headers: { Authorization: authorization },
    body: uploadForm,
    logger: meta.logger.child({
      method: "scrapePDFWithLlamaParse/upload/robustFetch"
    }),
    schema: z.object({ id: z.string() })
  });

  const resultUrl = `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`;
  const deadline = Date.now() + (meta.options.timeout ?? 300000);

  while (Date.now() <= deadline) {
    try {
      const { markdown } = await robustFetch({
        url: resultUrl,
        method: "GET",
        headers: { Authorization: authorization },
        logger: meta.logger.child({
          method: "scrapePDFWithLlamaParse/result/robustFetch"
        }),
        schema: z.object({ markdown: z.string() })
      });

      const html = await marked.parse(markdown, { async: true });
      return { markdown, html };
    } catch (e) {
      // Anything other than a failure-status error is passed through as-is.
      if (!(e instanceof Error) || e.message !== "Request sent failure status") {
        throw e;
      }

      const failedResponse = (e.cause as any).response;
      // A 404 means the result is not available yet: fall through and poll
      // again after the pause below.
      if (failedResponse.status !== 404) {
        if (failedResponse.body.includes("PDF_IS_BROKEN")) {
          // The URL did not actually point at a PDF.
          meta.logger.debug("URL is not actually a PDF, signalling...");
          throw new RemoveFeatureError(["pdf"]);
        }
        throw new Error("LlamaParse threw an error", {
          cause: e.cause
        });
      }
    }

    // Pause between polls.
    await new Promise<void>((resolve) => setTimeout(resolve, 250));
  }

  throw new Error("LlamaParse timed out");
}
2024-12-11 19:46:11 -03:00
/**
 * Extracts the text of a PDF stored on disk with `pdf-parse`.
 *
 * The extracted text is HTML-escaped and that same escaped string is returned
 * as both the markdown and the HTML.
 *
 * @param meta - Scrape context; only its logger is used here.
 * @param tempFilePath - Path of the PDF file on local disk.
 * @returns The escaped text in both `markdown` and `html`.
 */
async function scrapePDFWithParsePDF(
  meta: Meta,
  tempFilePath: string
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await fs.readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);
  const safeText = escapeHtml(text);

  return { markdown: safeText, html: safeText };
}
/**
 * Scrape engine entry point for PDF URLs.
 *
 * When `meta.options.parsePDF` is falsy the file is fetched and returned
 * base64-encoded in both `html` and `markdown`. Otherwise the file is
 * downloaded to a temporary path and parsed: LlamaParse is tried first when
 * `LLAMAPARSE_API_KEY` is set, and `pdf-parse` is used as the fallback when
 * LlamaParse is not configured, times out, or fails.
 *
 * The temporary file is removed on success and on failure alike.
 *
 * @param meta - Scrape context (URL, id, options, logger).
 * @returns The response URL and status code, with the parsed HTML/markdown.
 * @throws RemoveFeatureError propagated from LlamaParse when it reports the
 *   URL is not actually a PDF.
 * @throws Any error raised by the `pdf-parse` fallback.
 */
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    // Parsing disabled: return the raw file contents, base64-encoded.
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,

      html: content,
      markdown: content
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  let result: PDFProcessorResult | null = null;
  try {
    if (process.env.LLAMAPARSE_API_KEY) {
      try {
        result = await scrapePDFWithLlamaParse(
          {
            ...meta,
            logger: meta.logger.child({
              method: "scrapePDF/scrapePDFWithLlamaParse"
            })
          },
          tempFilePath
        );
      } catch (error) {
        if (error instanceof Error && error.message === "LlamaParse timed out") {
          meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
            error
          });
        } else if (error instanceof RemoveFeatureError) {
          // Not a PDF at all: no point in trying the fallback parser.
          throw error;
        } else {
          meta.logger.warn(
            "LlamaParse failed to parse PDF -- falling back to parse-pdf",
            { error }
          );
          Sentry.captureException(error);
        }
      }
    }

    if (result === null) {
      result = await scrapePDFWithParsePDF(
        {
          ...meta,
          logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" })
        },
        tempFilePath
      );
    }
  } catch (error) {
    // Parsing failed for good. Remove the downloaded file before propagating
    // so it does not leak; a cleanup failure is only logged so it cannot
    // mask the original error.
    try {
      await fs.unlink(tempFilePath);
    } catch (cleanupError) {
      meta.logger.warn("Failed to remove temporary PDF file", {
        tempFilePath,
        error: cleanupError
      });
    }
    throw error;
  }

  await fs.unlink(tempFilePath);

  return {
    url: response.url,
    statusCode: response.status,

    html: result.html,
    markdown: result.markdown
  };
}