feat: scrape event logging to DB

This commit is contained in:
Gergo Moricz
2024-07-24 14:31:25 +02:00
parent 6208ecdbc0
commit 7cd9bf92e3
12 changed files with 118 additions and 7 deletions
+3 -1
View File
@@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
@@ -60,10 +61,11 @@ export async function crawlController(req: Request, res: Response) {
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (mode === "single_urls" && !url.includes(",")) { if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
try { try {
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
jobId: uuidv4(),
mode: "single_urls", mode: "single_urls",
urls: [url], urls: [url],
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
+2
View File
@@ -9,6 +9,7 @@ import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { v4 as uuidv4 } from "uuid";
export async function scrapeHelper( export async function scrapeHelper(
req: Request, req: Request,
@@ -35,6 +36,7 @@ export async function scrapeHelper(
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
jobId: uuidv4(),
mode: "single_urls", mode: "single_urls",
urls: [url], urls: [url],
crawlerOptions: { crawlerOptions: {
+7
View File
@@ -7,8 +7,10 @@ import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities"; import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search"; import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
export async function searchHelper( export async function searchHelper(
jobId: string,
req: Request, req: Request,
team_id: string, team_id: string,
crawlerOptions: any, crawlerOptions: any,
@@ -75,6 +77,7 @@ export async function searchHelper(
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
jobId,
mode: "single_urls", mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
crawlerOptions: { crawlerOptions: {
@@ -148,6 +151,8 @@ export async function searchController(req: Request, res: Response) {
const searchOptions = req.body.searchOptions ?? { limit: 7 }; const searchOptions = req.body.searchOptions ?? { limit: 7 };
const jobId = uuidv4();
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1); await checkTeamCredits(team_id, 1);
@@ -160,6 +165,7 @@ export async function searchController(req: Request, res: Response) {
} }
const startTime = new Date().getTime(); const startTime = new Date().getTime();
const result = await searchHelper( const result = await searchHelper(
jobId,
req, req,
team_id, team_id,
crawlerOptions, crawlerOptions,
@@ -169,6 +175,7 @@ export async function searchController(req: Request, res: Response) {
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({ logJob({
job_id: jobId,
success: result.success, success: result.success,
message: result.error, message: result.error,
num_docs: result.data ? result.data.length : 0, num_docs: result.data ? result.data.length : 0,
+1
View File
@@ -4,6 +4,7 @@ async function example() {
const example = new WebScraperDataProvider(); const example = new WebScraperDataProvider();
await example.setOptions({ await example.setOptions({
jobId: "TEST",
mode: "crawl", mode: "crawl",
urls: ["https://mendable.ai"], urls: ["https://mendable.ai"],
crawlerOptions: {}, crawlerOptions: {},
+1
View File
@@ -56,6 +56,7 @@ export type CrawlerOptions = {
} }
export type WebScraperOptions = { export type WebScraperOptions = {
jobId: string;
urls: string[]; urls: string[];
mode: "single_urls" | "sitemap" | "crawl"; mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: CrawlerOptions; crawlerOptions?: CrawlerOptions;
+58
View File
@@ -0,0 +1,58 @@
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
/** Event recorded when a scrape attempt throws. */
export type ScrapeErrorEvent = {
  type: "error";
  /** Human-readable error message. */
  message: string;
  /** Stack trace of the thrown error, when one is available. */
  stack?: string;
};

/** Outcome of a single scraper invocation. */
type ScrapeAttemptResult = {
  success: boolean;
  response_code?: number;
  error?: string;
  // proxy?: string;
  /** Elapsed time of the attempt (the caller records a `Date.now()` difference, i.e. milliseconds). */
  time_taken: number;
};

/**
 * Event recorded for one scraper invocation.
 *
 * `result` is `null` while the attempt is still running and is filled in
 * afterwards via `ScrapeEvents.updateScrapeResult`.
 */
export type ScrapeScrapeEvent = {
  type: "scrape";
  /** Which of the known base scrapers performed the attempt. */
  method: (typeof baseScrapers)[number];
  result: ScrapeAttemptResult | null;
};

/** Event recorded for a queue state transition of a job. */
export type ScrapeQueueEvent = {
  type: "queue";
  event: "created" | "started" | "interrupted" | "finished";
  worker?: string;
};

/** Any event that can be stored in `scrape_events`, discriminated by `type`. */
export type ScrapeEvent =
  | ScrapeErrorEvent
  | ScrapeScrapeEvent
  | ScrapeQueueEvent;
/**
 * Best-effort persistence of scrape lifecycle events in the `scrape_events`
 * table.
 *
 * Event logging is auxiliary to scraping, so a failed database call resolves
 * to "nothing logged" instead of throwing into the scrape path.
 */
export class ScrapeEvents {
  /**
   * Inserts one event row for a job.
   *
   * @param jobId - Job the event belongs to. The literal `"TEST"` (used by the
   *   test suites) disables logging entirely.
   * @param content - Event payload; stored whole in the `content` column, with
   *   its `type` duplicated into the `type` column.
   * @returns The `id` of the inserted row, or `null` when nothing was logged
   *   (test job, `USE_DB_AUTH` unset, or the insert failed / returned no row).
   */
  static async insert(jobId: string, content: ScrapeEvent) {
    if (jobId === "TEST") return null;
    // NOTE(review): logging is gated on USE_DB_AUTH — confirm this is the env
    // var name the deployment actually sets.
    if (!process.env.USE_DB_AUTH) return null;

    const result = await supabase
      .from("scrape_events")
      .insert({
        job_id: jobId,
        type: content.type,
        content: content,
        // created_at
      })
      // Ask for the inserted row back; without this the response may carry no
      // data and the id would be unavailable.
      .select()
      .single();

    // A failed insert resolves with `data: null`; report "not logged" rather
    // than crashing on `.id`. (`any` because the row type is not declared here.)
    return (result.data as any)?.id ?? null;
  }

  /**
   * Fills in the `result` of a previously inserted scrape event.
   *
   * Reads the existing row, then writes back its `content` with `result`
   * replaced. Does nothing when `logId` is `null` (the event was never
   * logged) or when the row cannot be read.
   *
   * @param logId - Row id returned by {@link ScrapeEvents.insert}, or `null`.
   * @param result - Outcome of the scrape attempt to store.
   */
  static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
    if (logId === null) return;

    const lookup = await supabase
      .from("scrape_events")
      .select()
      .eq("id", logId)
      .single();
    const previousLog = lookup.data as any;

    // Row missing or lookup failed: there is no content to merge into.
    if (previousLog == null) return;

    await supabase
      .from("scrape_events")
      .update({
        content: {
          ...previousLog.content,
          result,
        },
      })
      .eq("id", logId);
  }
}
+2
View File
@@ -60,6 +60,7 @@ export async function runWebScraper({
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
if (mode === "crawl") { if (mode === "crawl") {
await provider.setOptions({ await provider.setOptions({
jobId: bull_job_id,
mode: mode, mode: mode,
urls: [url], urls: [url],
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
@@ -68,6 +69,7 @@ export async function runWebScraper({
}); });
} else { } else {
await provider.setOptions({ await provider.setOptions({
jobId: bull_job_id,
mode: mode, mode: mode,
urls: url.split(","), urls: url.split(","),
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
@@ -42,6 +42,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -76,6 +77,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -104,6 +106,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -133,6 +136,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -161,6 +165,7 @@ describe('WebCrawler', () => {
// Setup the crawler with the specific test case options // Setup the crawler with the specific test case options
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -194,6 +199,7 @@ describe('WebCrawler', () => {
const limit = 2; // Set a limit for the number of links const limit = 2; // Set a limit for the number of links
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
const pageOptionsWithHtml: PageOptions = { includeHtml: true }; const pageOptionsWithHtml: PageOptions = { includeHtml: true };
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
expect(resultWithHtml.html).toBeDefined(); expect(resultWithHtml.html).toBeDefined();
expect(resultWithoutHtml.html).toBeUndefined(); expect(resultWithoutHtml.html).toBeUndefined();
@@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
const url = 'https://mendable.ai'; const url = 'https://mendable.ai';
const pageOptions: PageOptions = { includeHtml: true }; const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl(url, pageOptions); const result = await scrapSingleUrl("TEST", url, pageOptions);
// Check if the result contains a list of links // Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined(); expect(result.linksOnPage).toBeDefined();
+5 -1
View File
@@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";
export class WebCrawler { export class WebCrawler {
private jobId: string;
private initialUrl: string; private initialUrl: string;
private baseUrl: string; private baseUrl: string;
private includes: string[]; private includes: string[];
@@ -27,6 +28,7 @@ export class WebCrawler {
private allowExternalContentLinks: boolean; private allowExternalContentLinks: boolean;
constructor({ constructor({
jobId,
initialUrl, initialUrl,
includes, includes,
excludes, excludes,
@@ -37,6 +39,7 @@ export class WebCrawler {
allowBackwardCrawling = false, allowBackwardCrawling = false,
allowExternalContentLinks = false allowExternalContentLinks = false
}: { }: {
jobId: string;
initialUrl: string; initialUrl: string;
includes?: string[]; includes?: string[];
excludes?: string[]; excludes?: string[];
@@ -47,6 +50,7 @@ export class WebCrawler {
allowBackwardCrawling?: boolean; allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean; allowExternalContentLinks?: boolean;
}) { }) {
this.jobId = jobId;
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = new URL(initialUrl).origin;
this.includes = includes ?? []; this.includes = includes ?? [];
@@ -261,7 +265,7 @@ export class WebCrawler {
// If it is the first link, fetch with single url // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
content = page.html ?? ""; content = page.html ?? "";
pageStatusCode = page.metadata?.pageStatusCode; pageStatusCode = page.metadata?.pageStatusCode;
pageError = page.metadata?.pageError || undefined; pageError = page.metadata?.pageError || undefined;
+4
View File
@@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
export class WebScraperDataProvider { export class WebScraperDataProvider {
private jobId: string;
private bullJobId: string; private bullJobId: string;
private urls: string[] = [""]; private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@@ -66,6 +67,7 @@ export class WebScraperDataProvider {
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const existingHTML = allHtmls ? allHtmls[i + index] : ""; const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl( const result = await scrapSingleUrl(
this.jobId,
url, url,
this.pageOptions, this.pageOptions,
this.extractorOptions, this.extractorOptions,
@@ -166,6 +168,7 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: this.jobId,
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes: this.includes,
excludes: this.excludes, excludes: this.excludes,
@@ -500,6 +503,7 @@ export class WebScraperDataProvider {
throw new Error("Urls are required"); throw new Error("Urls are required");
} }
this.jobId = options.jobId;
this.bullJobId = options.bullJobId; this.bullJobId = options.bullJobId;
this.urls = options.urls; this.urls = options.urls;
this.mode = options.mode; this.mode = options.mode;
+26 -2
View File
@@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils"; import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
dotenv.config(); dotenv.config();
const baseScrapers = [ export const baseScrapers = [
"fire-engine", "fire-engine",
"fire-engine;chrome-cdp", "fire-engine;chrome-cdp",
"scrapingBee", "scrapingBee",
@@ -118,6 +119,7 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl( export async function scrapSingleUrl(
jobId: string,
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions = {
onlyMainContent: true, onlyMainContent: true,
@@ -145,6 +147,13 @@ export async function scrapSingleUrl(
} = { text: "", screenshot: "", metadata: {} }; } = { text: "", screenshot: "", metadata: {} };
let screenshot = ""; let screenshot = "";
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(jobId, {
type: "scrape",
method,
result: null,
});
switch (method) { switch (method) {
case "fire-engine": case "fire-engine":
case "fire-engine;chrome-cdp": case "fire-engine;chrome-cdp":
@@ -254,8 +263,18 @@ export async function scrapSingleUrl(
} }
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
success: !!scraperResponse.metadata.pageError && !!text,
error: scraperResponse.metadata.pageError,
response_code: scraperResponse.metadata.pageStatusCode,
time_taken: Date.now() - timer,
});
return { return {
text: await parseMarkdown(cleanedHtml), text,
html: cleanedHtml, html: cleanedHtml,
rawHtml: scraperResponse.text, rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot, screenshot: scraperResponse.screenshot,
@@ -379,6 +398,11 @@ export async function scrapSingleUrl(
return document; return document;
} catch (error) { } catch (error) {
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
ScrapeEvents.insert(jobId, {
type: "error",
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
stack: error.stack,
});
return { return {
content: "", content: "",
markdown: "", markdown: "",