Files
firecrawl/apps/api/src/controllers/v1/scrape.ts
T

123 lines
3.3 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
ScrapeResponse,
} from "./types";
2024-08-16 15:14:37 -03:00
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
2024-08-23 18:27:00 +02:00
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
2024-08-16 15:14:37 -03:00
import { logJob } from "../../services/logging/log_job";
2024-08-28 12:46:59 -03:00
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
2024-10-25 20:21:12 +02:00
import { getScrapeQueue } from "../../services/queue-service";
2024-08-06 15:24:45 -03:00
/**
 * Handles a v1 single-URL scrape request.
 *
 * Flow: validate the body with `scrapeRequestSchema`, enqueue a scrape job
 * under a freshly generated id, wait for the job to finish, remove it from the
 * scrape queue, bill the team, log the job, and respond with the document.
 *
 * Responses sent from this handler:
 * - 200 with the scraped document on success. `scrape_id` is only populated
 *   when the request origin contains "website".
 * - 408 when waiting for the job rejects with an `Error` whose message starts
 *   with "Job wait".
 * - 500 for any other rejection while waiting for the job.
 *
 * Errors thrown by `scrapeRequestSchema.parse`, `getJobPriority`,
 * `addScrapeJob` or the queue removal are not caught here and propagate to the
 * caller.
 *
 * @param req - Authenticated request whose body carries the scrape options.
 * @param res - Express response used to send the `ScrapeResponse`.
 * @returns The Express response after it has been sent.
 */
export async function scrapeController(
  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
  res: Response<ScrapeResponse>
) {
  // Replace the raw body with the schema-parsed version so every later read
  // goes through the validated object.
  req.body = scrapeRequestSchema.parse(req.body);

  const origin = req.body.origin;
  const timeout = req.body.timeout;

  const jobId = uuidv4();
  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  await addScrapeJob(
    {
      url: req.body.url,
      mode: "single_urls",
      team_id: req.auth.team_id,
      scrapeOptions: req.body,
      internalOptions: {},
      plan: req.auth.plan!,
      origin: req.body.origin,
      is_scrape: true,
    },
    {},
    jobId,
    jobPriority
  );

  // Explicitly requested waiting time (page-level `waitFor` plus every "wait"
  // action) is added on top of the timeout so it does not eat into it.
  const totalWait =
    (req.body.waitFor ?? 0) +
    (req.body.actions ?? []).reduce(
      (a, x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a,
      0
    );

  let doc: Document;
  try {
    doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
  } catch (e) {
    logger.error(`Error in scrapeController: ${e}`);
    if (e instanceof Error && e.message.startsWith("Job wait")) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    }

    // Narrow before touching `.message`: the rejection value is not
    // guaranteed to be an Error. Anything else is stringified as-is.
    const detail = e instanceof Error && e.message ? e.message : e;
    return res.status(500).json({
      success: false,
      error: `(Internal server error) - ${detail}`,
    });
  }

  await getScrapeQueue().remove(jobId);

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  // TODO: fix — token counting is currently disabled, so this is always 0.
  // Previously intended: numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  const numTokens = 0;

  // 1 credit per document, 5 when an extract was requested and configured.
  const creditsToBeBilled =
    req.body.extract && req.body.formats.includes("extract") ? 5 : 1;

  // Billing is fire-and-forget: a billing failure is logged but must not fail
  // the scrape response.
  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
    (error) => {
      logger.error(
        `Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`
      );
      // Optionally, you could notify an admin or add to a retry queue here
    }
  );

  // Strip rawHtml from the result unless the caller asked for that format.
  if (!req.body.formats.includes("rawHtml") && doc && doc.rawHtml) {
    delete doc.rawHtml;
  }

  // Not awaited: the response is sent without waiting for logging to finish.
  logJob({
    job_id: jobId,
    success: true,
    message: "Scrape completed",
    num_docs: 1,
    docs: [doc],
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "scrape",
    url: req.body.url,
    scrapeOptions: req.body,
    origin: origin,
    num_tokens: numTokens,
  });

  return res.status(200).json({
    success: true,
    data: doc,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}