Files
firecrawl/apps/api/src/controllers/v1/extract.ts
T

114 lines
3.2 KiB
TypeScript
Raw Normal View History

2024-10-28 16:02:07 -03:00
import { Request, Response } from "express";
import {
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
} from "./types";
2025-01-03 20:44:27 -03:00
import { getExtractQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
2025-01-07 16:16:01 -03:00
import { saveExtract } from "../../lib/extract/extract-redis";
2025-01-07 17:49:21 -03:00
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
import { performExtraction } from "../../lib/extract/extraction-service";
2024-10-28 16:02:07 -03:00
2025-01-10 18:35:10 -03:00
/**
 * Runs an extraction inline, inside the request/response cycle (the legacy
 * non-queue path), and sends the outcome as the HTTP response.
 *
 * @param req - Authenticated request; `req.body` is forwarded to
 *   `performExtraction` as the extract request.
 * @param res - Response used to send either the extraction result (HTTP 200)
 *   or a generic error payload (HTTP 500).
 * @param extractId - Identifier handed to `performExtraction` for this run.
 * @returns The Express response after the JSON body has been sent.
 */
export async function oldExtract(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>,
  extractId: string,
) {
  // Means that are in the non-queue system
  // TODO: Remove this once all teams have transitioned to the new system
  try {
    const result = await performExtraction(extractId, {
      request: req.body,
      teamId: req.auth.team_id,
      // Fall back to the "free" plan when the auth context carries none.
      plan: req.auth.plan ?? "free",
      // Normalise a null sub_id to undefined.
      subId: req.acuc?.sub_id ?? undefined,
    });

    return res.status(200).json(result);
  } catch (error) {
    // Report the failure before answering; the client only ever sees the
    // generic message below, so without this the root cause would be lost.
    Sentry.captureException(error);

    return res.status(500).json({
      success: false,
      error: "Internal server error",
    });
  }
}
2024-11-20 13:16:36 -08:00
/**
 * Starts an extraction for the provided URLs based on the request parameters.
 * Currently in beta.
 *
 * Flow, as implemented here:
 * 1. The request body is validated/normalised with `extractRequestSchema`.
 * 2. If `getTeamIdSyncB` resolves truthy for the team and the request origin
 *    is neither "api-sdk" nor "website", the request is handled inline by
 *    `oldExtract` and its response is returned.
 * 3. Otherwise an extract record with status "processing" is saved and a job
 *    is added to the extract queue, using the extract id as the job id. When
 *    Sentry is initialised the publish is wrapped in a span and the trace
 *    headers are attached to the job payload.
 *
 * @param req - The request object containing authentication and extraction details.
 * @param res - The response object used to send the result.
 * @returns A promise that resolves once the response has been sent. On the
 *   queue path this happens as soon as the job is enqueued, not when the
 *   extraction itself has finished; the response only carries the extract id.
 * @throws Whatever `extractRequestSchema.parse` throws for an invalid body
 *   (presumably a schema validation error - confirm against `./types`), as
 *   well as any error raised while saving the record or enqueueing the job.
 */
export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>,
) {
  req.body = extractRequestSchema.parse(req.body);

  const extractId = crypto.randomUUID();
  const jobData = {
    request: req.body,
    teamId: req.auth.team_id,
    plan: req.auth.plan,
    subId: req.acuc?.sub_id,
    extractId,
  };

  // Teams still on the non-queue system are served inline, unless the request
  // comes from the SDK or the website.
  if (
    (await getTeamIdSyncB(req.auth.team_id)) &&
    req.body.origin !== "api-sdk" &&
    req.body.origin !== "website"
  ) {
    return await oldExtract(req, res, extractId);
  }

  // Persist the extract record before publishing the job.
  await saveExtract(extractId, {
    id: extractId,
    team_id: req.auth.team_id,
    plan: req.auth.plan,
    createdAt: Date.now(),
    status: "processing",
    showSteps: req.body.__experimental_streamSteps,
    showLLMUsage: req.body.__experimental_llmUsage,
    showSources: req.body.__experimental_showSources || req.body.showSources,
  });

  // Resolve the queue once so the span attributes and the publish call refer
  // to the same queue object.
  const extractQueue = getExtractQueue();

  if (Sentry.isInitialized()) {
    // Length of the serialised payload, recorded on the span and on the job.
    const size = JSON.stringify(jobData).length;

    await Sentry.startSpan(
      {
        name: "Add extract job",
        op: "queue.publish",
        attributes: {
          "messaging.message.id": extractId,
          "messaging.destination.name": extractQueue.name,
          "messaging.message.body.size": size,
        },
      },
      async (span) => {
        await extractQueue.add(
          extractId,
          {
            ...jobData,
            // Trace headers travel with the job payload.
            sentry: {
              trace: Sentry.spanToTraceHeader(span),
              baggage: Sentry.spanToBaggageHeader(span),
              size,
            },
          },
          { jobId: extractId },
        );
      },
    );
  } else {
    await extractQueue.add(extractId, jobData, {
      jobId: extractId,
    });
  }

  return res.status(200).json({
    success: true,
    id: extractId,
    urlTrace: [],
  });
}