Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
T

33 lines
910 B
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
2024-12-11 19:46:11 -03:00
/**
 * Inspects response headers for a Content-Type that identifies a PDF or DOCX
 * document and, if one is found, throws an `AddFeatureError` naming the
 * corresponding feature.
 *
 * The header name is matched case-insensitively. The header value is reduced
 * to its bare media type (parameters such as `charset` removed, surrounding
 * whitespace trimmed, lowercased) before comparison, so values like
 * `Application/PDF` or `application/pdf ; charset=binary` are recognised.
 *
 * @param logger - Logger used to emit a warning when no Content-Type header
 *   is present.
 * @param headers - Response headers keyed by header name; may be `undefined`.
 * @throws AddFeatureError with `["pdf"]` when the media type is
 *   `application/pdf`, or with `["docx"]` when it is the OOXML
 *   word-processing document type.
 */
export function specialtyScrapeCheck(
  logger: Logger,
  headers: Record<string, string> | undefined,
): void {
  // Header names are case-insensitive, so scan the entries instead of
  // indexing by a fixed key.
  const contentType = Object.entries(headers ?? {}).find(
    ([name]) => name.toLowerCase() === "content-type",
  )?.[1];

  if (contentType === undefined) {
    logger.warn("Failed to check contentType -- was not present in headers", {
      headers,
    });
    return;
  }

  // Keep only the "type/subtype" part: drop any ";param=value" suffix,
  // tolerate whitespace around it, and compare case-insensitively.
  const separatorIndex = contentType.indexOf(";");
  const mediaType = (
    separatorIndex === -1 ? contentType : contentType.slice(0, separatorIndex)
  )
    .trim()
    .toLowerCase();

  if (mediaType === "application/pdf") {
    // .pdf
    throw new AddFeatureError(["pdf"]);
  }

  if (
    mediaType ===
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  ) {
    // .docx
    throw new AddFeatureError(["docx"]);
  }
}