2024-10-17 19:40:18 +02:00
import { Response } from "express" ;
import { v4 as uuidv4 } from "uuid" ;
import {
2024-10-23 15:37:24 -03:00
BatchScrapeRequest ,
batchScrapeRequestSchema ,
2024-12-14 01:11:43 +01:00
batchScrapeRequestSchemaNoURLValidation ,
url as urlSchema ,
2024-10-17 19:40:18 +02:00
RequestWithAuth ,
2024-12-11 19:51:08 -03:00
ScrapeOptions ,
2024-12-14 01:11:43 +01:00
BatchScrapeResponse ,
2024-10-17 19:40:18 +02:00
} from "./types" ;
import {
addCrawlJobs ,
2025-01-20 09:40:59 +01:00
finishCrawlKickoff ,
2024-12-04 23:35:29 +01:00
getCrawl ,
2024-10-17 19:40:18 +02:00
lockURLs ,
saveCrawl ,
2024-12-11 19:51:08 -03:00
StoredCrawl ,
2024-10-17 19:40:18 +02:00
} from "../../lib/crawl-redis" ;
import { logCrawl } from "../../services/logging/crawl_log" ;
import { getJobPriority } from "../../lib/job-priority" ;
2024-10-25 20:21:12 +02:00
import { addScrapeJobs } from "../../services/queue-jobs" ;
2024-11-14 22:39:41 +01:00
import { callWebhook } from "../../services/webhook" ;
2024-12-05 20:50:36 +01:00
import { logger as _logger } from "../../lib/logger" ;
2025-04-17 09:23:53 -07:00
import { CostTracking } from "../../lib/extract/extraction-service" ;
2024-10-17 19:40:18 +02:00
2024-10-23 15:37:24 -03:00
/**
 * Handles a v1 batch scrape request: validates the body, registers (or
 * extends) a batch scrape, enqueues one scrape job per URL and replies with
 * the batch id and its status URL.
 *
 * Flow:
 *  1. Parse the body with the strict schema, or with the schema that skips
 *     URL validation when `ignoreInvalidURLs === true`.
 *  2. Use `appendToId` as the batch id when given, otherwise mint a new UUID.
 *  3. With `ignoreInvalidURLs`, validate each URL individually and collect
 *     the rejected ones instead of failing the whole request.
 *  4. Load the stored crawl being appended to, or create and save a new one.
 *  5. Build the jobs, lock their URLs, register the job ids and enqueue them.
 *  6. Fire the `batch_scrape.started` webhook when one is configured.
 *
 * @param req - Authenticated request carrying the batch scrape body.
 * @param res - Express response.
 * @returns A 200 response with `success`, `id`, `url` and `invalidURLs`, or a
 *   404 response when `appendToId` does not name a batch owned by the caller.
 * @throws Whatever the schema `parse` call throws for an invalid body.
 */
export async function batchScrapeController(
  req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
  res: Response<BatchScrapeResponse>,
) {
  if (req.body?.ignoreInvalidURLs === true) {
    req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
  } else {
    req.body = batchScrapeRequestSchema.parse(req.body);
  }

  const id = req.body.appendToId ?? uuidv4();
  const logger = _logger.child({
    crawlId: id,
    batchScrapeId: id,
    module: "api/v1",
    method: "batchScrapeController",
    teamId: req.auth.team_id,
  });

  let urls = req.body.urls;
  let invalidURLs: string[] | undefined = undefined;
  if (req.body.ignoreInvalidURLs) {
    // The lenient schema skipped URL validation, so validate one by one here
    // and keep the rejects to report them back to the caller.
    const accepted: typeof urls = [];
    const rejected: string[] = [];
    for (const candidate of urls) {
      try {
        accepted.push(urlSchema.parse(candidate));
      } catch {
        rejected.push(candidate);
      }
    }
    urls = accepted;
    invalidURLs = rejected;
  }

  logger.debug("Batch scrape " + id + " starting", {
    urlsLength: urls.length,
    appendToId: req.body.appendToId,
    account: req.account,
  });

  let sc: StoredCrawl;
  if (req.body.appendToId) {
    // Appending to an existing batch: it must exist and belong to the caller.
    // NOTE(review): assumes getCrawl resolves to a falsy value for unknown
    // ids and that BatchScrapeResponse admits the error shape below - confirm.
    const existing = await getCrawl(req.body.appendToId);
    if (!existing || existing.team_id !== req.auth.team_id) {
      logger.warn("Batch scrape to append to was not found", {
        appendToId: req.body.appendToId,
      });
      return res.status(404).json({
        success: false,
        error: "Batch scrape not found",
      });
    }
    sc = existing;
  } else {
    await logCrawl(id, req.auth.team_id);
    sc = {
      crawlerOptions: null,
      scrapeOptions: req.body,
      // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
      internalOptions: {
        disableSmartWaitCache: true,
        teamId: req.auth.team_id,
      },
      team_id: req.auth.team_id,
      createdAt: Date.now(),
    };
    await saveCrawl(id, sc);
  }

  // Batches of up to 1000 URLs use the default priority of 20; larger ones
  // ask for a computed priority with a base of 21.
  const jobPriority =
    urls.length > 1000
      ? await getJobPriority({
          team_id: req.auth.team_id,
          basePriority: 21,
        })
      : 20;
  logger.debug("Using job priority " + jobPriority, { jobPriority });

  // Everything in the body except the batch-level fields is forwarded to each
  // job as its scrape options.
  const { urls: _urls, appendToId: _appendToId, ...forwardedOptions } = req.body;
  const scrapeOptions: ScrapeOptions = forwardedOptions;

  const jobs = urls.map((url) => ({
    data: {
      url,
      mode: "single_urls" as const,
      team_id: req.auth.team_id,
      crawlerOptions: null,
      scrapeOptions,
      origin: "api",
      crawl_id: id,
      sitemapped: true,
      v1: true,
      webhook: req.body.webhook,
    },
    opts: {
      jobId: uuidv4(),
      // Use the priority computed above rather than a hard-coded 20, so the
      // value that is logged is the value that is actually enqueued.
      priority: jobPriority,
    },
  }));

  await finishCrawlKickoff(id);

  logger.debug("Locking URLs...");
  await lockURLs(
    id,
    sc,
    jobs.map((job) => job.data.url),
  );

  logger.debug("Adding scrape jobs to Redis...");
  await addCrawlJobs(
    id,
    jobs.map((job) => job.opts.jobId),
  );

  logger.debug("Adding scrape jobs to BullMQ...");
  await addScrapeJobs(jobs);

  if (req.body.webhook) {
    logger.debug("Calling webhook with batch_scrape.started...", {
      webhook: req.body.webhook,
    });
    await callWebhook(
      req.auth.team_id,
      id,
      null,
      req.body.webhook,
      true,
      "batch_scrape.started",
    );
  }

  // Outside local development the status URL is always advertised as https.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
    invalidURLs,
  });
}