Nick: from bulk to batch

This commit is contained in:
Nicolas
2024-10-23 15:37:24 -03:00
parent 70c4e7c334
commit d8abd15716
8 changed files with 84 additions and 83 deletions
@@ -1,8 +1,8 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BulkScrapeRequest,
bulkScrapeRequestSchema,
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
@@ -17,11 +17,11 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
export async function bulkScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>,
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = bulkScrapeRequestSchema.parse(req.body);
req.body = batchScrapeRequestSchema.parse(req.body);
const id = uuidv4();
@@ -92,7 +92,7 @@ export async function bulkScrapeController(
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}
+2 -2
View File
@@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
return jobs;
}
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
@@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const data = doneJobs.map(x => x.returnvalue);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());
+2 -2
View File
@@ -144,7 +144,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export const bulkScrapeRequestSchema = scrapeOptions.extend({
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage).refine(
@@ -163,7 +163,7 @@ export const bulkScrapeRequestSchema = scrapeOptions.extend({
return obj;
});
export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
+6 -5
View File
@@ -17,7 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { bulkScrapeController } from "../controllers/v1/bulk-scrape";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@@ -124,12 +124,12 @@ v1Router.post(
);
v1Router.post(
"/bulk/scrape",
"/batch/scrape",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(bulkScrapeController)
wrap(batchScrapeController)
);
v1Router.post(
@@ -147,9 +147,10 @@ v1Router.get(
);
v1Router.get(
"/bulk/scrape/:jobId",
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
// Intentionally reuses the crawl status controller; isBatch = true makes the nextURL it builds point at /v1/batch/scrape/:jobId instead of /v1/crawl/:jobId
wrap((req:any, res):any => crawlStatusController(req, res, true))
);
v1Router.get(