feat: scrape event logging to DB

2024-07-24 14:31:25 +02:00
parent 6208ecdbc0
commit 7cd9bf92e3
12 changed files with 118 additions and 7 deletions
@@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 export async function crawlController(req: Request, res: Response) {
  try {
@@ -60,10 +61,11 @@ export async function crawlController(req: Request, res: Response) {
    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-    if (mode === "single_urls" && !url.includes(",")) {
+    if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
      try {
        const a = new WebScraperDataProvider();
        await a.setOptions({
          jobId: uuidv4(),
          mode: "single_urls",
          urls: [url],
          crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
@@ -9,6 +9,7 @@ import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
 import { v4 as uuidv4 } from "uuid";
 export async function scrapeHelper(
  req: Request,
@@ -35,6 +36,7 @@ export async function scrapeHelper(
  const a = new WebScraperDataProvider();
  await a.setOptions({
    jobId: uuidv4(),
    mode: "single_urls",
    urls: [url],
    crawlerOptions: {
@@ -7,8 +7,10 @@ import { logJob } from "../services/logging/log_job";
 import { PageOptions, SearchOptions } from "../lib/entities";
 import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 export async function searchHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
@@ -75,6 +77,7 @@ export async function searchHelper(
  const a = new WebScraperDataProvider();
  await a.setOptions({
    jobId,
    mode: "single_urls",
    urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
    crawlerOptions: {
@@ -148,6 +151,8 @@ export async function searchController(req: Request, res: Response) {
    const searchOptions = req.body.searchOptions ?? { limit: 7 };
    const jobId = uuidv4();
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
@@ -160,6 +165,7 @@ export async function searchController(req: Request, res: Response) {
    }
    const startTime = new Date().getTime();
    const result = await searchHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
@@ -169,6 +175,7 @@ export async function searchController(req: Request, res: Response) {
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    logJob({
      job_id: jobId,
      success: result.success,
      message: result.error,
      num_docs: result.data ? result.data.length : 0,
@@ -4,6 +4,7 @@ async function example() {
  const example = new WebScraperDataProvider();
  await example.setOptions({
    jobId: "TEST",
    mode: "crawl",
    urls: ["https://mendable.ai"],
    crawlerOptions: {},
@@ -56,6 +56,7 @@ export type CrawlerOptions = {
 }
 export type WebScraperOptions = {
  jobId: string;
  urls: string[];
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: CrawlerOptions;
@@ -0,0 +1,58 @@
 import type { baseScrapers } from "../scraper/WebScraper/single_url";
 import { supabase_service as supabase } from "../services/supabase";
 /**
  * Event recorded when scraping a URL fails with an exception.
  * `message` is the human-readable error text; `stack` is the stack trace
  * when one is available.
  */
 export type ScrapeErrorEvent = {
  type: "error",
  message: string,
  stack?: string,
 }
 /**
  * Event describing a single scrape attempt made with one scraping method.
  * `result` is `null` until an outcome has been recorded for the attempt;
  * see `ScrapeEvents.updateScrapeResult`, which fills it in afterwards.
  */
 export type ScrapeScrapeEvent = {
  type: "scrape",
  // One of the entries of the `baseScrapers` list exported by single_url.
  method: (typeof baseScrapers)[number],
  result: null | {
    success: boolean,
    // presumably the HTTP status code of the scraped page — TODO confirm against callers
    response_code?: number,
    error?: string,
    // proxy?: string,
    // presumably elapsed time in milliseconds (a Date.now() difference) — TODO confirm against callers
    time_taken: number,
  },
 }
 /**
  * Event describing a queue lifecycle transition of a scrape job.
  * NOTE(review): no producer of this event is visible here; `worker`
  * presumably identifies the worker handling the job — confirm.
  */
 export type ScrapeQueueEvent = {
  type: "queue",
  event: "created" | "started" | "interrupted" | "finished",
  worker?: string,
 }
 /** Union of every event shape that can be logged, discriminated by the `type` field. */
 export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
 /**
  * Best-effort persistence of scrape events to the `scrape_events` table.
  *
  * Event logging is diagnostic only, so neither method ever rejects: database
  * errors are reported on the console and swallowed. This matters because
  * callers invoke these methods without awaiting or catching them, and a
  * logging failure must never fail (or crash) the scrape that triggered it.
  */
 export class ScrapeEvents {
  /**
   * Stores one event for the given job.
   *
   * @param jobId   Id of the job the event belongs to. The sentinel `"TEST"`
   *                disables logging entirely.
   * @param content The event payload; its `type` is also stored in its own column.
   * @returns The id of the inserted row, or `null` when logging is disabled
   *          or the insert did not yield a row.
   */
  static async insert(jobId: string, content: ScrapeEvent) {
    if (jobId === "TEST") return null;
    // NOTE(review): confirm USE_DB_AUTH is the intended environment variable name.
    if (!process.env.USE_DB_AUTH) return null;

    try {
      const { data, error } = await supabase
        .from("scrape_events")
        .insert({
          job_id: jobId,
          type: content.type,
          content: content,
          // created_at
        })
        // Ask for the inserted row back; without this the response may carry
        // no data (supabase-js v2 behaviour — confirm against the installed
        // version), leaving no id to return.
        .select()
        .single();

      if (error) {
        console.error("Failed to insert scrape event:", error);
        return null;
      }
      return (data as { id?: number } | null)?.id ?? null;
    } catch (error) {
      console.error("Failed to insert scrape event:", error);
      return null;
    }
  }

  /**
   * Attaches the outcome of a scrape attempt to a previously inserted event,
   * preserving the rest of the stored content.
   *
   * @param logId  Row id returned by {@link ScrapeEvents.insert}; a nullish id
   *               (logging disabled or insert failed) makes this a no-op.
   * @param result Outcome to store under `content.result`.
   */
  static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
    if (logId == null) return;

    try {
      const { data: previousLog, error: fetchError } = await supabase
        .from("scrape_events")
        .select()
        .eq("id", logId)
        .single();

      // Without the stored row there is no content to merge the result into.
      if (fetchError || !previousLog) {
        console.error(`Scrape event ${logId} could not be loaded for update:`, fetchError);
        return;
      }

      const { error: updateError } = await supabase.from("scrape_events").update({
        content: {
          ...(previousLog as { content?: object }).content,
          result,
        }
      }).eq("id", logId);

      if (updateError) {
        console.error(`Failed to update scrape event ${logId}:`, updateError);
      }
    } catch (error) {
      console.error(`Failed to update scrape event ${logId}:`, error);
    }
  }
 }
@@ -60,6 +60,7 @@ export async function runWebScraper({
    const provider = new WebScraperDataProvider();
    if (mode === "crawl") {
      await provider.setOptions({
        jobId: bull_job_id,
        mode: mode,
        urls: [url],
        crawlerOptions: crawlerOptions,
@@ -68,6 +69,7 @@ export async function runWebScraper({
      });
    } else {
      await provider.setOptions({
        jobId: bull_job_id,
        mode: mode,
        urls: url.split(","),
        crawlerOptions: crawlerOptions,
@@ -42,6 +42,7 @@ describe('WebCrawler', () => {
    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -76,6 +77,7 @@ describe('WebCrawler', () => {
    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -104,6 +106,7 @@ describe('WebCrawler', () => {
    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -133,6 +136,7 @@ describe('WebCrawler', () => {
    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -161,6 +165,7 @@ describe('WebCrawler', () => {
    // Setup the crawler with the specific test case options
    const crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -194,6 +199,7 @@ describe('WebCrawler', () => {
    const limit = 2;  // Set a limit for the number of links
    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
    const pageOptionsWithHtml: PageOptions = { includeHtml: true };
    const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
-    const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
+    const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
-    const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
+    const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
    expect(resultWithHtml.html).toBeDefined();
    expect(resultWithoutHtml.html).toBeUndefined();
@@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
  const url = 'https://mendable.ai';
  const pageOptions: PageOptions = { includeHtml: true };
-  const result = await scrapSingleUrl(url, pageOptions);
+  const result = await scrapSingleUrl("TEST", url, pageOptions);
  // Check if the result contains a list of links
  expect(result.linksOnPage).toBeDefined();
@@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
 export class WebCrawler {
  private jobId: string;
  private initialUrl: string;
  private baseUrl: string;
  private includes: string[];
@@ -27,6 +28,7 @@ export class WebCrawler {
  private allowExternalContentLinks: boolean;
  constructor({
    jobId,
    initialUrl,
    includes,
    excludes,
@@ -37,6 +39,7 @@ export class WebCrawler {
    allowBackwardCrawling = false,
    allowExternalContentLinks = false
  }: {
    jobId: string;
    initialUrl: string;
    includes?: string[];
    excludes?: string[];
@@ -47,6 +50,7 @@ export class WebCrawler {
    allowBackwardCrawling?: boolean;
    allowExternalContentLinks?: boolean;
  }) {
    this.jobId = jobId;
    this.initialUrl = initialUrl;
    this.baseUrl = new URL(initialUrl).origin;
    this.includes = includes ?? [];
@@ -261,7 +265,7 @@ export class WebCrawler {
      // If it is the first link, fetch with single url
      if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
        content = page.html ?? "";
        pageStatusCode = page.metadata?.pageStatusCode;
        pageError = page.metadata?.pageError || undefined;
@@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
 import { Logger } from "../../lib/logger";
 export class WebScraperDataProvider {
  private jobId: string;
  private bullJobId: string;
  private urls: string[] = [""];
  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@@ -66,6 +67,7 @@ export class WebScraperDataProvider {
        batchUrls.map(async (url, index) => {
          const existingHTML = allHtmls ? allHtmls[i + index] : "";
          const result = await scrapSingleUrl(
            this.jobId,
            url,
            this.pageOptions,
            this.extractorOptions,
@@ -166,6 +168,7 @@ export class WebScraperDataProvider {
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    const crawler = new WebCrawler({
      jobId: this.jobId,
      initialUrl: this.urls[0],
      includes: this.includes,
      excludes: this.excludes,
@@ -500,6 +503,7 @@ export class WebScraperDataProvider {
      throw new Error("Urls are required");
    }
    this.jobId = options.jobId;
    this.bullJobId = options.bullJobId;
    this.urls = options.urls;
    this.mode = options.mode;
@@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright";
 import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
 import { extractLinks } from "./utils/utils";
 import { Logger } from "../../lib/logger";
 import { ScrapeEvents } from "../../lib/scrape-events";
 dotenv.config();
-const baseScrapers = [
+export const baseScrapers = [
  "fire-engine",
  "fire-engine;chrome-cdp",
  "scrapingBee",
@@ -118,6 +119,7 @@ function getScrapingFallbackOrder(
 export async function scrapSingleUrl(
  jobId: string,
  urlToScrap: string,
  pageOptions: PageOptions = {
    onlyMainContent: true,
@@ -145,6 +147,13 @@ export async function scrapSingleUrl(
    } = { text: "", screenshot: "", metadata: {} };
    let screenshot = "";
    const timer = Date.now();
    const logInsertPromise = ScrapeEvents.insert(jobId, {
      type: "scrape",
      method,
      result: null,
    });
    switch (method) {
      case "fire-engine":
      case "fire-engine;chrome-cdp":  
@@ -254,8 +263,18 @@ export async function scrapSingleUrl(
    }
    //* TODO: add an optional to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    const text = await parseMarkdown(cleanedHtml);
    // Event logging is best-effort: a failed insert must not fail the scrape itself.
    const insertedLogId = await logInsertPromise.catch(() => null);
    // Deliberately not awaited; the trailing catch keeps a logging failure from
    // surfacing as an unhandled promise rejection.
    ScrapeEvents.updateScrapeResult(insertedLogId, {
      // The attempt succeeded when there is NO page error and some text was extracted.
      success: !scraperResponse.metadata.pageError && !!text,
      error: scraperResponse.metadata.pageError,
      response_code: scraperResponse.metadata.pageStatusCode,
      time_taken: Date.now() - timer,
    }).catch((logError) => {
      Logger.debug(`Failed to record scrape result: ${logError}`);
    });
    return {
-      text: await parseMarkdown(cleanedHtml),
+      text,
      html: cleanedHtml,
      rawHtml: scraperResponse.text,
      screenshot: scraperResponse.screenshot,
@@ -379,6 +398,11 @@ export async function scrapSingleUrl(
    return document;
  } catch (error) {
    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
    ScrapeEvents.insert(jobId, {
      type: "error",
      message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
      stack: error.stack,
    });
    return {
      content: "",
      markdown: "",