Nick: fixed prettier

This commit is contained in:
Nicolas
2024-12-11 19:46:11 -03:00
parent e5fe9e1534
commit 00335e2ba9
134 changed files with 9565 additions and 7108 deletions
+61 -30
View File
@@ -5,7 +5,7 @@ import {
crawlRequestSchema,
CrawlResponse,
RequestWithAuth,
toLegacyCrawlerOptions,
toLegacyCrawlerOptions
} from "./types";
import {
addCrawlJob,
@@ -14,7 +14,7 @@ import {
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
StoredCrawl
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
@@ -32,21 +32,31 @@ export async function crawlController(
req.body = crawlRequestSchema.parse(req.body);
const id = uuidv4();
const logger = _logger.child({ crawlId: id, module: "api/v1", method: "crawlController", teamId: req.auth.team_id, plan: req.auth.plan });
logger.debug("Crawl " + id + " starting", { request: req.body, originalRequest: preNormalizedBody, account: req.account });
const logger = _logger.child({
crawlId: id,
module: "api/v1",
method: "crawlController",
teamId: req.auth.team_id,
plan: req.auth.plan
});
logger.debug("Crawl " + id + " starting", {
request: req.body,
originalRequest: preNormalizedBody,
account: req.account
});
await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
if (!useDbAuthentication) {
remainingCredits = Infinity;
}
const crawlerOptions = {
...req.body,
url: undefined,
scrapeOptions: undefined,
scrapeOptions: undefined
};
const scrapeOptions = req.body.scrapeOptions;
@@ -73,8 +83,12 @@ export async function crawlController(
const originalLimit = crawlerOptions.limit;
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
logger.debug("Determined limit: " + crawlerOptions.limit, { remainingCredits, bodyLimit: originalLimit, originalBodyLimit: preNormalizedBody.limit });
logger.debug("Determined limit: " + crawlerOptions.limit, {
remainingCredits,
bodyLimit: originalLimit,
originalBodyLimit: preNormalizedBody.limit
});
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
@@ -82,7 +96,7 @@ export async function crawlController(
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
plan: req.auth.plan
};
const crawler = crawlToCrawler(id, sc);
@@ -90,7 +104,9 @@ export async function crawlController(
try {
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
} catch (e) {
logger.debug("Failed to get robots.txt (this is probably fine!)", { error: e });
logger.debug("Failed to get robots.txt (this is probably fine!)", {
error: e
});
}
await saveCrawl(id, sc);
@@ -98,15 +114,21 @@ export async function crawlController(
const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null && sitemap.length > 0) {
logger.debug("Using sitemap of length " + sitemap.length, { sitemapLength: sitemap.length });
logger.debug("Using sitemap of length " + sitemap.length, {
sitemapLength: sitemap.length
});
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(sitemap.length > 1000){
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });
@@ -127,14 +149,14 @@ export async function crawlController(
crawl_id: id,
sitemapped: true,
webhook: req.body.webhook,
v1: true,
v1: true
},
opts: {
jobId: uuid,
priority: 20,
},
priority: 20
}
};
})
});
logger.debug("Locking URLs...");
await lockURLs(
@@ -150,7 +172,9 @@ export async function crawlController(
logger.debug("Adding scrape jobs to BullMQ...");
await getScrapeQueue().addBulk(jobs);
} else {
logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap });
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap
});
logger.debug("Locking URL...");
await lockURL(id, sc, req.body.url);
@@ -168,30 +192,37 @@ export async function crawlController(
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
v1: true
},
{
priority: 15,
priority: 15
},
jobId,
jobId
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(id, jobId);
}
logger.debug("Done queueing jobs!");
if(req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", { webhook: req.body.webhook });
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
if (req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: req.body.webhook
});
await callWebhook(
req.auth.team_id,
id,
null,
req.body.webhook,
true,
"crawl.started"
);
}
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
});
}