feat(v1/batch/scrape): add ignoreInvalidURLs option

This commit is contained in:
Gergő Móricz
2024-12-14 01:11:43 +01:00
parent e74e4bcefc
commit 4b5014d7fe
4 changed files with 72 additions and 7 deletions
+33 -7
View File
@@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import { import {
BatchScrapeRequest, BatchScrapeRequest,
batchScrapeRequestSchema, batchScrapeRequestSchema,
CrawlResponse, batchScrapeRequestSchemaNoURLValidation,
url as urlSchema,
RequestWithAuth, RequestWithAuth,
ScrapeOptions, ScrapeOptions,
BatchScrapeResponse,
} from "./types"; } from "./types";
import { import {
addCrawlJobs, addCrawlJobs,
@@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger"; import { logger as _logger } from "../../lib/logger";
export async function batchScrapeController( export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>, res: Response<BatchScrapeResponse>,
) { ) {
req.body = batchScrapeRequestSchema.parse(req.body); if (req.body?.ignoreInvalidURLs === true) {
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
} else {
req.body = batchScrapeRequestSchema.parse(req.body);
}
const id = req.body.appendToId ?? uuidv4(); const id = req.body.appendToId ?? uuidv4();
const logger = _logger.child({ const logger = _logger.child({
@@ -35,8 +41,27 @@ export async function batchScrapeController(
teamId: req.auth.team_id, teamId: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan,
}); });
let urls = req.body.urls;
let invalidURLs: string[] | undefined = undefined;
if (req.body.ignoreInvalidURLs) {
invalidURLs = [];
let pendingURLs = urls;
urls = [];
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
urls.push(nu);
} catch (_) {
invalidURLs.push(u);
}
}
}
logger.debug("Batch scrape " + id + " starting", { logger.debug("Batch scrape " + id + " starting", {
urlsLength: req.body.urls, urlsLength: urls,
appendToId: req.body.appendToId, appendToId: req.body.appendToId,
account: req.account, account: req.account,
}); });
@@ -70,7 +95,7 @@ export async function batchScrapeController(
// If it is over 1000, we need to get the job priority, // If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20 // otherwise we can use the default priority of 20
if (req.body.urls.length > 1000) { if (urls.length > 1000) {
// set base to 21 // set base to 21
jobPriority = await getJobPriority({ jobPriority = await getJobPriority({
plan: req.auth.plan, plan: req.auth.plan,
@@ -84,7 +109,7 @@ export async function batchScrapeController(
delete (scrapeOptions as any).urls; delete (scrapeOptions as any).urls;
delete (scrapeOptions as any).appendToId; delete (scrapeOptions as any).appendToId;
const jobs = req.body.urls.map((x) => { const jobs = urls.map((x) => {
return { return {
data: { data: {
url: x, url: x,
@@ -140,5 +165,6 @@ export async function batchScrapeController(
success: true, success: true,
id, id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
invalidURLs,
}); });
} }
+34
View File
@@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(), webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(), appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
}) })
.strict(strictMessage) .strict(strictMessage)
.refine( .refine(
@@ -446,6 +471,15 @@ export type CrawlResponse =
url: string; url: string;
}; };
/**
 * Response body sent by the v1 batch scrape controller.
 *
 * Either an `ErrorResponse` (declared elsewhere in this module; its shape is
 * not visible here) or a success payload describing the batch scrape that
 * was started.
 */
export type BatchScrapeResponse =
| ErrorResponse
| {
success: true;
// Batch scrape id: the request's `appendToId` when provided, otherwise a
// freshly generated UUID.
id: string;
// Absolute URL of the form `<protocol>://<host>/v1/batch/scrape/<id>`.
url: string;
// Only set when the request enabled `ignoreInvalidURLs`: the input entries
// that failed URL schema parsing and were left out of the batch. May be an
// empty array when every URL was valid; left undefined when the option is
// off.
invalidURLs?: string[];
};
export type MapResponse = export type MapResponse =
| ErrorResponse | ErrorResponse
| { | {
+4
View File
@@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
} }
export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobs(id: string, job_ids: string[]) {
if (job_ids.length === 0) return true;
_logger.debug("Adding crawl jobs to Redis...", { _logger.debug("Adding crawl jobs to Redis...", {
jobIds: job_ids, jobIds: job_ids,
module: "crawl-redis", module: "crawl-redis",
@@ -261,6 +263,8 @@ export async function lockURLs(
sc: StoredCrawl, sc: StoredCrawl,
urls: string[], urls: string[],
): Promise<boolean> { ): Promise<boolean> {
if (urls.length === 0) return true;
urls = urls.map((url) => normalizeURL(url, sc)); urls = urls.map((url) => normalizeURL(url, sc));
const logger = _logger.child({ const logger = _logger.child({
crawlId: id, crawlId: id,
+1
View File
@@ -108,6 +108,7 @@ export async function addScrapeJobs(
}; };
}[], }[],
) { ) {
if (jobs.length === 0) return true;
// TODO: better // TODO: better
await Promise.all( await Promise.all(
jobs.map((job) => jobs.map((job) =>