Nick: fixed prettier
This commit is contained in:
@@ -5,7 +5,7 @@ import {
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
RequestWithAuth,
|
||||
toLegacyCrawlerOptions,
|
||||
toLegacyCrawlerOptions
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
@@ -14,7 +14,7 @@ import {
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
StoredCrawl
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
@@ -32,21 +32,31 @@ export async function crawlController(
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
const logger = _logger.child({ crawlId: id, module: "api/v1", method: "crawlController", teamId: req.auth.team_id, plan: req.auth.plan });
|
||||
logger.debug("Crawl " + id + " starting", { request: req.body, originalRequest: preNormalizedBody, account: req.account });
|
||||
const logger = _logger.child({
|
||||
crawlId: id,
|
||||
module: "api/v1",
|
||||
method: "crawlController",
|
||||
teamId: req.auth.team_id,
|
||||
plan: req.auth.plan
|
||||
});
|
||||
logger.debug("Crawl " + id + " starting", {
|
||||
request: req.body,
|
||||
originalRequest: preNormalizedBody,
|
||||
account: req.account
|
||||
});
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
let { remainingCredits } = req.account!;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if(!useDbAuthentication){
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||
if (!useDbAuthentication) {
|
||||
remainingCredits = Infinity;
|
||||
}
|
||||
|
||||
const crawlerOptions = {
|
||||
...req.body,
|
||||
url: undefined,
|
||||
scrapeOptions: undefined,
|
||||
scrapeOptions: undefined
|
||||
};
|
||||
const scrapeOptions = req.body.scrapeOptions;
|
||||
|
||||
@@ -73,8 +83,12 @@ export async function crawlController(
|
||||
|
||||
const originalLimit = crawlerOptions.limit;
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
logger.debug("Determined limit: " + crawlerOptions.limit, { remainingCredits, bodyLimit: originalLimit, originalBodyLimit: preNormalizedBody.limit });
|
||||
|
||||
logger.debug("Determined limit: " + crawlerOptions.limit, {
|
||||
remainingCredits,
|
||||
bodyLimit: originalLimit,
|
||||
originalBodyLimit: preNormalizedBody.limit
|
||||
});
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
|
||||
@@ -82,7 +96,7 @@ export async function crawlController(
|
||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
plan: req.auth.plan
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
@@ -90,7 +104,9 @@ export async function crawlController(
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
||||
} catch (e) {
|
||||
logger.debug("Failed to get robots.txt (this is probably fine!)", { error: e });
|
||||
logger.debug("Failed to get robots.txt (this is probably fine!)", {
|
||||
error: e
|
||||
});
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
@@ -98,15 +114,21 @@ export async function crawlController(
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
logger.debug("Using sitemap of length " + sitemap.length, { sitemapLength: sitemap.length });
|
||||
logger.debug("Using sitemap of length " + sitemap.length, {
|
||||
sitemapLength: sitemap.length
|
||||
});
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
if (sitemap.length > 1000) {
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
||||
jobPriority = await getJobPriority({
|
||||
plan: req.auth.plan,
|
||||
team_id: req.auth.team_id,
|
||||
basePriority: 21
|
||||
});
|
||||
}
|
||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||
|
||||
@@ -127,14 +149,14 @@ export async function crawlController(
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
v1: true
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
priority: 20
|
||||
}
|
||||
};
|
||||
})
|
||||
});
|
||||
|
||||
logger.debug("Locking URLs...");
|
||||
await lockURLs(
|
||||
@@ -150,7 +172,9 @@ export async function crawlController(
|
||||
logger.debug("Adding scrape jobs to BullMQ...");
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap });
|
||||
logger.debug("Sitemap not found or ignored.", {
|
||||
ignoreSitemap: sc.crawlerOptions.ignoreSitemap
|
||||
});
|
||||
|
||||
logger.debug("Locking URL...");
|
||||
await lockURL(id, sc, req.body.url);
|
||||
@@ -168,30 +192,37 @@ export async function crawlController(
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
v1: true
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
priority: 15
|
||||
},
|
||||
jobId,
|
||||
jobId
|
||||
);
|
||||
logger.debug("Adding scrape job to BullMQ...", { jobId });
|
||||
await addCrawlJob(id, jobId);
|
||||
}
|
||||
logger.debug("Done queueing jobs!");
|
||||
|
||||
if(req.body.webhook) {
|
||||
logger.debug("Calling webhook with crawl.started...", { webhook: req.body.webhook });
|
||||
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
|
||||
if (req.body.webhook) {
|
||||
logger.debug("Calling webhook with crawl.started...", {
|
||||
webhook: req.body.webhook
|
||||
});
|
||||
await callWebhook(
|
||||
req.auth.team_id,
|
||||
id,
|
||||
null,
|
||||
req.body.webhook,
|
||||
true,
|
||||
"crawl.started"
|
||||
);
|
||||
}
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user