feat: scrape event logging to DB
This commit is contained in:
@@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
|
|||||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||||
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
@@ -60,10 +61,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||||
|
|
||||||
if (mode === "single_urls" && !url.includes(",")) {
|
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||||
try {
|
try {
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
|
jobId: uuidv4(),
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import { Document } from "../lib/entities";
|
|||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||||
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
@@ -35,6 +36,7 @@ export async function scrapeHelper(
|
|||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
|
jobId: uuidv4(),
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
|
|||||||
@@ -7,8 +7,10 @@ import { logJob } from "../services/logging/log_job";
|
|||||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||||
import { search } from "../search";
|
import { search } from "../search";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||||
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
|
||||||
export async function searchHelper(
|
export async function searchHelper(
|
||||||
|
jobId: string,
|
||||||
req: Request,
|
req: Request,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
@@ -75,6 +77,7 @@ export async function searchHelper(
|
|||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
|
jobId,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
@@ -148,6 +151,8 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
||||||
|
|
||||||
|
const jobId = uuidv4();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||||
await checkTeamCredits(team_id, 1);
|
await checkTeamCredits(team_id, 1);
|
||||||
@@ -160,6 +165,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
const result = await searchHelper(
|
const result = await searchHelper(
|
||||||
|
jobId,
|
||||||
req,
|
req,
|
||||||
team_id,
|
team_id,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
@@ -169,6 +175,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
logJob({
|
logJob({
|
||||||
|
job_id: jobId,
|
||||||
success: result.success,
|
success: result.success,
|
||||||
message: result.error,
|
message: result.error,
|
||||||
num_docs: result.data ? result.data.length : 0,
|
num_docs: result.data ? result.data.length : 0,
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ async function example() {
|
|||||||
const example = new WebScraperDataProvider();
|
const example = new WebScraperDataProvider();
|
||||||
|
|
||||||
await example.setOptions({
|
await example.setOptions({
|
||||||
|
jobId: "TEST",
|
||||||
mode: "crawl",
|
mode: "crawl",
|
||||||
urls: ["https://mendable.ai"],
|
urls: ["https://mendable.ai"],
|
||||||
crawlerOptions: {},
|
crawlerOptions: {},
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ export type CrawlerOptions = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
export type WebScraperOptions = {
|
||||||
|
jobId: string;
|
||||||
urls: string[];
|
urls: string[];
|
||||||
mode: "single_urls" | "sitemap" | "crawl";
|
mode: "single_urls" | "sitemap" | "crawl";
|
||||||
crawlerOptions?: CrawlerOptions;
|
crawlerOptions?: CrawlerOptions;
|
||||||
|
|||||||
@@ -0,0 +1,58 @@
|
|||||||
|
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||||
|
import { supabase_service as supabase } from "../services/supabase";
|
||||||
|
|
||||||
|
/**
 * Event describing a failure, stored in the scrape event log.
 */
export type ScrapeErrorEvent = {
  /** Discriminant tag identifying this variant of `ScrapeEvent`. */
  type: "error";
  /** Human-readable description of the failure. */
  message: string;
  /** Stack trace of the failure, when one is available. */
  stack?: string;
};
|
||||||
|
|
||||||
|
/**
 * Event describing a single scrape attempt made with one scraper method.
 *
 * `result` starts as `null` and is filled in later once the outcome of the
 * attempt is known.
 */
export type ScrapeScrapeEvent = {
  /** Discriminant tag identifying this variant of `ScrapeEvent`. */
  type: "scrape";
  /** Scraper that performed the attempt; one of the entries of `baseScrapers`. */
  method: (typeof baseScrapers)[number];
  /** Outcome of the attempt, or `null` while it is not yet recorded. */
  result: null | {
    /** Whether the attempt was considered successful. */
    success: boolean;
    /** Status code reported for the scraped page, if any. */
    response_code?: number;
    /** Error reported for the scraped page, if any. */
    error?: string;
    // proxy?: string;
    /** Elapsed time of the attempt (callers pass a `Date.now()` difference, i.e. milliseconds). */
    time_taken: number;
  };
};
|
||||||
|
|
||||||
|
/**
 * Event describing a queue lifecycle transition of a job.
 */
export type ScrapeQueueEvent = {
  /** Discriminant tag identifying this variant of `ScrapeEvent`. */
  type: "queue";
  /** Which lifecycle transition occurred. */
  event: "created" | "started" | "interrupted" | "finished";
  /** Identifier of the worker involved, when known. NOTE(review): exact format is not defined here. */
  worker?: string;
};
|
||||||
|
|
||||||
|
/**
 * Any event that can be written to the scrape event log.
 * Discriminated by the `type` field of each member.
 */
export type ScrapeEvent =
  | ScrapeErrorEvent
  | ScrapeScrapeEvent
  | ScrapeQueueEvent;
|
||||||
|
|
||||||
|
/**
 * Writes scrape events to the `scrape_events` Supabase table.
 *
 * Both methods are best-effort: callers invoke them without awaiting or
 * catching, so a logging failure is reported on the console and swallowed
 * rather than allowed to reject and disturb the scrape itself.
 */
export class ScrapeEvents {
  /**
   * Inserts one event row for a job.
   *
   * @param jobId - Job the event belongs to. The literal `"TEST"` disables logging.
   * @param content - Event payload; stored whole in the `content` column, with
   *   its `type` tag mirrored into the `type` column.
   * @returns The `id` of the inserted row, or `null` when logging is disabled
   *   (`"TEST"` job or `USE_DB_AUTH` unset) or the insert failed.
   */
  static async insert(jobId: string, content: ScrapeEvent): Promise<number | null> {
    if (jobId === "TEST") return null;
    if (!process.env.USE_DB_AUTH) return null;

    try {
      // `.select()` is required for the inserted row to be returned; without it
      // `data` is null and reading `.id` would throw.
      const { data, error } = await supabase
        .from("scrape_events")
        .insert({
          job_id: jobId,
          type: content.type,
          content: content,
          // created_at
        })
        .select()
        .single();

      if (error) {
        console.error(`Error inserting scrape event: ${error.message}`);
        return null;
      }

      return (data as { id?: number } | null)?.id ?? null;
    } catch (error) {
      console.error(
        `Error inserting scrape event: ${error instanceof Error ? error.message : String(error)}`
      );
      return null;
    }
  }

  /**
   * Fills in the `result` of a previously inserted scrape event.
   *
   * The stored `content` is read, merged with the new `result`, and written
   * back. The read and the write are two separate requests.
   *
   * @param logId - Row id returned by `insert`; `null` makes this a no-op.
   * @param result - Outcome to store under `content.result`.
   */
  static async updateScrapeResult(
    logId: number | null,
    result: ScrapeScrapeEvent["result"]
  ): Promise<void> {
    if (logId === null) return;

    try {
      const { data: previousLog, error: readError } = await supabase
        .from("scrape_events")
        .select()
        .eq("id", logId)
        .single();

      // Nothing to merge into when the row cannot be read.
      if (readError || !previousLog) {
        console.error(
          `Error updating scrape result: ${readError?.message ?? `event ${logId} not found`}`
        );
        return;
      }

      const previousContent = (previousLog as { content?: Record<string, unknown> }).content;
      const { error: updateError } = await supabase
        .from("scrape_events")
        .update({
          content: {
            ...previousContent,
            result,
          },
        })
        .eq("id", logId);

      if (updateError) {
        console.error(`Error updating scrape result: ${updateError.message}`);
      }
    } catch (error) {
      console.error(
        `Error updating scrape result: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }
}
|
||||||
@@ -60,6 +60,7 @@ export async function runWebScraper({
|
|||||||
const provider = new WebScraperDataProvider();
|
const provider = new WebScraperDataProvider();
|
||||||
if (mode === "crawl") {
|
if (mode === "crawl") {
|
||||||
await provider.setOptions({
|
await provider.setOptions({
|
||||||
|
jobId: bull_job_id,
|
||||||
mode: mode,
|
mode: mode,
|
||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
@@ -68,6 +69,7 @@ export async function runWebScraper({
|
|||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
await provider.setOptions({
|
await provider.setOptions({
|
||||||
|
jobId: bull_job_id,
|
||||||
mode: mode,
|
mode: mode,
|
||||||
urls: url.split(","),
|
urls: url.split(","),
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ describe('WebCrawler', () => {
|
|||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
@@ -76,6 +77,7 @@ describe('WebCrawler', () => {
|
|||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
@@ -104,6 +106,7 @@ describe('WebCrawler', () => {
|
|||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
@@ -133,6 +136,7 @@ describe('WebCrawler', () => {
|
|||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
@@ -161,6 +165,7 @@ describe('WebCrawler', () => {
|
|||||||
|
|
||||||
// Setup the crawler with the specific test case options
|
// Setup the crawler with the specific test case options
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
@@ -194,6 +199,7 @@ describe('WebCrawler', () => {
|
|||||||
const limit = 2; // Set a limit for the number of links
|
const limit = 2; // Set a limit for the number of links
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
|
jobId: "TEST",
|
||||||
initialUrl: initialUrl,
|
initialUrl: initialUrl,
|
||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
|
|||||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
||||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
||||||
|
|
||||||
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
|
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
||||||
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
|
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
||||||
|
|
||||||
expect(resultWithHtml.html).toBeDefined();
|
expect(resultWithHtml.html).toBeDefined();
|
||||||
expect(resultWithoutHtml.html).toBeUndefined();
|
expect(resultWithoutHtml.html).toBeUndefined();
|
||||||
@@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
|
|||||||
const url = 'https://mendable.ai';
|
const url = 'https://mendable.ai';
|
||||||
const pageOptions: PageOptions = { includeHtml: true };
|
const pageOptions: PageOptions = { includeHtml: true };
|
||||||
|
|
||||||
const result = await scrapSingleUrl(url, pageOptions);
|
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||||
|
|
||||||
// Check if the result contains a list of links
|
// Check if the result contains a list of links
|
||||||
expect(result.linksOnPage).toBeDefined();
|
expect(result.linksOnPage).toBeDefined();
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout";
|
|||||||
import { Logger } from "../../../src/lib/logger";
|
import { Logger } from "../../../src/lib/logger";
|
||||||
|
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
|
private jobId: string;
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
private baseUrl: string;
|
private baseUrl: string;
|
||||||
private includes: string[];
|
private includes: string[];
|
||||||
@@ -27,6 +28,7 @@ export class WebCrawler {
|
|||||||
private allowExternalContentLinks: boolean;
|
private allowExternalContentLinks: boolean;
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
|
jobId,
|
||||||
initialUrl,
|
initialUrl,
|
||||||
includes,
|
includes,
|
||||||
excludes,
|
excludes,
|
||||||
@@ -37,6 +39,7 @@ export class WebCrawler {
|
|||||||
allowBackwardCrawling = false,
|
allowBackwardCrawling = false,
|
||||||
allowExternalContentLinks = false
|
allowExternalContentLinks = false
|
||||||
}: {
|
}: {
|
||||||
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
includes?: string[];
|
includes?: string[];
|
||||||
excludes?: string[];
|
excludes?: string[];
|
||||||
@@ -47,6 +50,7 @@ export class WebCrawler {
|
|||||||
allowBackwardCrawling?: boolean;
|
allowBackwardCrawling?: boolean;
|
||||||
allowExternalContentLinks?: boolean;
|
allowExternalContentLinks?: boolean;
|
||||||
}) {
|
}) {
|
||||||
|
this.jobId = jobId;
|
||||||
this.initialUrl = initialUrl;
|
this.initialUrl = initialUrl;
|
||||||
this.baseUrl = new URL(initialUrl).origin;
|
this.baseUrl = new URL(initialUrl).origin;
|
||||||
this.includes = includes ?? [];
|
this.includes = includes ?? [];
|
||||||
@@ -261,7 +265,7 @@ export class WebCrawler {
|
|||||||
|
|
||||||
// If it is the first link, fetch with single url
|
// If it is the first link, fetch with single url
|
||||||
if (this.visited.size === 1) {
|
if (this.visited.size === 1) {
|
||||||
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
|
||||||
content = page.html ?? "";
|
content = page.html ?? "";
|
||||||
pageStatusCode = page.metadata?.pageStatusCode;
|
pageStatusCode = page.metadata?.pageStatusCode;
|
||||||
pageError = page.metadata?.pageError || undefined;
|
pageError = page.metadata?.pageError || undefined;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
|||||||
import { Logger } from "../../lib/logger";
|
import { Logger } from "../../lib/logger";
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
|
private jobId: string;
|
||||||
private bullJobId: string;
|
private bullJobId: string;
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||||
@@ -66,6 +67,7 @@ export class WebScraperDataProvider {
|
|||||||
batchUrls.map(async (url, index) => {
|
batchUrls.map(async (url, index) => {
|
||||||
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
||||||
const result = await scrapSingleUrl(
|
const result = await scrapSingleUrl(
|
||||||
|
this.jobId,
|
||||||
url,
|
url,
|
||||||
this.pageOptions,
|
this.pageOptions,
|
||||||
this.extractorOptions,
|
this.extractorOptions,
|
||||||
@@ -166,6 +168,7 @@ export class WebScraperDataProvider {
|
|||||||
inProgress?: (progress: Progress) => void
|
inProgress?: (progress: Progress) => void
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
|
jobId: this.jobId,
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
includes: this.includes,
|
includes: this.includes,
|
||||||
excludes: this.excludes,
|
excludes: this.excludes,
|
||||||
@@ -500,6 +503,7 @@ export class WebScraperDataProvider {
|
|||||||
throw new Error("Urls are required");
|
throw new Error("Urls are required");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.jobId = options.jobId;
|
||||||
this.bullJobId = options.bullJobId;
|
this.bullJobId = options.bullJobId;
|
||||||
this.urls = options.urls;
|
this.urls = options.urls;
|
||||||
this.mode = options.mode;
|
this.mode = options.mode;
|
||||||
|
|||||||
@@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright";
|
|||||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||||
import { extractLinks } from "./utils/utils";
|
import { extractLinks } from "./utils/utils";
|
||||||
import { Logger } from "../../lib/logger";
|
import { Logger } from "../../lib/logger";
|
||||||
|
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
const baseScrapers = [
|
export const baseScrapers = [
|
||||||
"fire-engine",
|
"fire-engine",
|
||||||
"fire-engine;chrome-cdp",
|
"fire-engine;chrome-cdp",
|
||||||
"scrapingBee",
|
"scrapingBee",
|
||||||
@@ -118,6 +119,7 @@ function getScrapingFallbackOrder(
|
|||||||
|
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
|
jobId: string,
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = {
|
pageOptions: PageOptions = {
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
@@ -145,6 +147,13 @@ export async function scrapSingleUrl(
|
|||||||
} = { text: "", screenshot: "", metadata: {} };
|
} = { text: "", screenshot: "", metadata: {} };
|
||||||
let screenshot = "";
|
let screenshot = "";
|
||||||
|
|
||||||
|
const timer = Date.now();
|
||||||
|
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
||||||
|
type: "scrape",
|
||||||
|
method,
|
||||||
|
result: null,
|
||||||
|
});
|
||||||
|
|
||||||
switch (method) {
|
switch (method) {
|
||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
case "fire-engine;chrome-cdp":
|
case "fire-engine;chrome-cdp":
|
||||||
@@ -254,8 +263,18 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
//* TODO: add an optional to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||||
|
const text = await parseMarkdown(cleanedHtml);
|
||||||
|
|
||||||
|
const insertedLogId = await logInsertPromise;
// An attempt counts as successful only when the scraper reported NO page
// error and the markdown conversion produced non-empty text.
// The update is fire-and-forget, so a rejection is handled here to keep a
// logging failure from surfacing as an unhandled promise rejection.
void ScrapeEvents.updateScrapeResult(insertedLogId, {
  success: !scraperResponse.metadata.pageError && !!text,
  error: scraperResponse.metadata.pageError,
  response_code: scraperResponse.metadata.pageStatusCode,
  time_taken: Date.now() - timer,
}).catch((logError) => {
  Logger.debug(
    `Failed to record scrape result: ${logError instanceof Error ? logError.message : String(logError)}`
  );
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
text: await parseMarkdown(cleanedHtml),
|
text,
|
||||||
html: cleanedHtml,
|
html: cleanedHtml,
|
||||||
rawHtml: scraperResponse.text,
|
rawHtml: scraperResponse.text,
|
||||||
screenshot: scraperResponse.screenshot,
|
screenshot: scraperResponse.screenshot,
|
||||||
@@ -379,6 +398,11 @@ export async function scrapSingleUrl(
|
|||||||
return document;
|
return document;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||||
|
ScrapeEvents.insert(jobId, {
|
||||||
|
type: "error",
|
||||||
|
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||||
|
stack: error.stack,
|
||||||
|
});
|
||||||
return {
|
return {
|
||||||
content: "",
|
content: "",
|
||||||
markdown: "",
|
markdown: "",
|
||||||
|
|||||||
Reference in New Issue
Block a user