apps/api/src/lib/extract/extraction-service.ts

import { Document, ExtractRequest, URLTrace } from "../../controllers/v1/types";
import { PlanType } from "../../types";
import { logger } from "../logger";
import { processUrl } from "./url-processor";
import { scrapeDocument } from "./document-scraper";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
import { updateExtract } from "./extract-redis";
import { CUSTOM_U_TEAMS } from "./config";

/**
 * Inputs to `performExtraction` other than the extract id.
 */
interface ExtractServiceOptions {
  /** The extract request; `performExtraction` reads its URLs, prompt, schema, timeout and related settings. */
  request: ExtractRequest;
  /** Team the extraction runs for; forwarded to URL processing, scraping, billing and job logging. */
  teamId: string;
  /** Plan of the team; forwarded to URL processing and scraping. */
  plan: PlanType;
  /** Optional id passed through to `billTeam` (presumably a subscription id — confirm against the billing service). */
  subId?: string;
}

/**
 * Outcome returned by `performExtraction`.
 */
interface ExtractResult {
  /** `false` when no links were found or scraping threw; `true` otherwise. */
  success: boolean;
  /** On success, the extracted payload from the completion (an empty object when the completion has none). */
  data?: any;
  /** The extract id that was passed in, echoed back on every result. */
  extractId: string;
  /** Warning reported by the completion step, if any. */
  warning?: string;
  /** Per-URL traces; always set on failure, and on success only when the request asks for it. */
  urlTrace?: URLTrace[];
  /** Human-readable failure reason; only set when `success` is `false`. */
  error?: string;
}

/**
 * Reduces a URL to its `protocol//hostname` origin-like prefix.
 *
 * A trailing `/*` wildcard is dropped before parsing. If the remaining text
 * cannot be parsed as a URL, it is returned as-is (with the wildcard already
 * removed).
 *
 * @param url - URL to reduce, optionally ending in `/*`.
 * @returns `protocol//hostname` of the URL, or the (wildcard-stripped) input
 *   when it is not a valid URL.
 */
function getRootDomain(url: string): string {
  const wildcardSuffix = "/*";
  const candidate = url.endsWith(wildcardSuffix)
    ? url.slice(0, -wildcardSuffix.length)
    : url;

  try {
    const { protocol, hostname } = new URL(candidate);
    return `${protocol}//${hostname}`;
  } catch {
    // Not parseable as a URL: hand back what we were given, minus the wildcard.
    return candidate;
  }
}

/**
 * Runs the extract pipeline for a single request.
 *
 * Steps, in order:
 * 1. Expand every requested URL into candidate links via `processUrl`.
 * 2. Scrape each link via `scrapeDocument`, keeping the non-null documents.
 * 3. Ask the LLM (`generateOpenAICompletions`) to extract data from the
 *    concatenated documents, following the request's prompt and schema.
 * 4. Apportion the reported token count across the URL traces by each
 *    document's share of the total markdown length.
 * 5. Fire-and-forget: bill the team, log the job, then mark the extract as
 *    completed. Failures in these steps are logged and never fail the call.
 *
 * @param extractId - Id of this extract; used for logging and the status update,
 *   and echoed back in the result.
 * @param options - The request together with the team, plan and optional sub id.
 * @returns A failed result (with `error` and the URL traces) when no links were
 *   found or scraping threw; otherwise a successful result carrying the
 *   extracted data. On success the URL traces are included only when
 *   `request.urlTrace` is truthy.
 */
export async function performExtraction(
  extractId: string,
  options: ExtractServiceOptions,
): Promise<ExtractResult> {
  // Captured on entry so the logged `time_taken` reflects the real duration.
  const startTime = Date.now();
  const { request, teamId, plan, subId } = options;
  const urlTraces: URLTrace[] = [];
  const docs: Document[] = [];

  // Process URLs
  const urlPromises = request.urls.map((url) =>
    processUrl(
      {
        url,
        prompt: request.prompt,
        teamId,
        plan,
        allowExternalLinks: request.allowExternalLinks,
        origin: request.origin,
        limit: request.limit,
        includeSubdomains: request.includeSubdomains,
      },
      urlTraces,
    ),
  );

  const processedUrls = await Promise.all(urlPromises);
  const links = processedUrls.flat().filter((url) => url);

  if (links.length === 0) {
    return {
      success: false,
      error:
        "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
      extractId,
      urlTrace: urlTraces,
    };
  }

  // Scrape documents
  // Scraping gets 70% of the request timeout (40s when unset); 30s if that works out to 0.
  const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
  const scrapePromises = links.map((url) =>
    scrapeDocument(
      {
        url,
        teamId,
        plan,
        origin: request.origin || "api",
        timeout,
      },
      urlTraces,
    ),
  );

  try {
    const results = await Promise.all(scrapePromises);
    docs.push(...results.filter((doc): doc is Document => doc !== null));
  } catch (error) {
    return {
      success: false,
      // Anything can be thrown; only Error instances are guaranteed a message.
      error: error instanceof Error ? error.message : String(error),
      extractId,
      urlTrace: urlTraces,
    };
  }

  // Generate completions
  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractService/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt:
        (request.systemPrompt ? `${request.systemPrompt}\n` : "") +
        "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
        links.join(", "),
      prompt: request.prompt,
      schema: request.schema,
    },
    docs.map((x) => buildDocument(x)).join("\n"),
    undefined,
    true,
  );

  // Update token usage in traces
  const numTokens = completions.numTokens;
  if (numTokens) {
    const totalLength = docs.reduce(
      (sum, doc) => sum + (doc.markdown?.length || 0),
      0,
    );
    docs.forEach((doc) => {
      const sourceURL = doc.metadata?.sourceURL;
      if (!sourceURL) {
        return;
      }
      const trace = urlTraces.find((t) => t.url === sourceURL);
      if (trace && trace.contentStats) {
        // With no markdown at all the share would be 0/0 (NaN); treat it as 0.
        const share =
          totalLength > 0 ? (doc.markdown?.length || 0) / totalLength : 0;
        trace.contentStats.tokensUsed = Math.floor(share * numTokens);
      }
    });
  }

  // Kickoff background crawl for indexing root domains
  // const rootDomains = new Set(request.urls.map(getRootDomain));
  // rootDomains.forEach(async url => {
  //   const crawlId = crypto.randomUUID();

  //   // Create and save crawl configuration first
  //   const sc: StoredCrawl = {
  //     originUrl: url,
  //     crawlerOptions: {
  //       maxDepth: 15,
  //       limit: 5000,
  //       includePaths: [],
  //       excludePaths: [],
  //       ignoreSitemap: false,
  //       includeSubdomains: true,
  //       allowExternalLinks: false,
  //       allowBackwardLinks: true
  //     },
  //     scrapeOptions: {
  //         formats: ["markdown"],
  //         onlyMainContent: true,
  //         waitFor: 0,
  //         mobile: false,
  //         removeBase64Images: true,
  //         fastMode: false,
  //         parsePDF: true,
  //         skipTlsVerification: false,
  //     },
  //     internalOptions: {
  //       disableSmartWaitCache: true,
  //       isBackgroundIndex: true
  //     },
  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
  //     createdAt: Date.now(),
  //     plan: "hobby", // make it a low concurrency
  //   };

  //   // Save the crawl configuration
  //   await saveCrawl(crawlId, sc);

  //   // Then kick off the job
  //   await _addScrapeJobToBullMQ({
  //     url,
  //     mode: "kickoff" as const,
  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
  //     plan: "hobby", // make it a low concurrency
  //     crawlerOptions: sc.crawlerOptions,
  //     scrapeOptions: sc.scrapeOptions,
  //     internalOptions: sc.internalOptions,
  //     origin: "index",
  //     crawl_id: crawlId,
  //     webhook: null,
  //     v1: true,
  //   }, {}, crypto.randomUUID(), 50);
  // });

  // 5 credits per scraped link, except teams in CUSTOM_U_TEAMS, which pay a flat 1.
  const linksBilled = CUSTOM_U_TEAMS.includes(teamId) ? 1 : links.length * 5;

  // Bill team for usage (fire-and-forget; a billing failure must not fail the extract)
  billTeam(teamId, subId, linksBilled).catch((error) => {
    logger.error(
      `Failed to bill team ${teamId} for ${linksBilled} credits: ${error}`,
    );
  });

  // Log job, then mark the extract as completed (fire-and-forget)
  logJob({
    job_id: extractId,
    success: true,
    message: "Extract completed",
    num_docs: 1,
    docs: completions.extract ?? {},
    time_taken: (Date.now() - startTime) / 1000,
    team_id: teamId,
    mode: "extract",
    url: request.urls.join(", "),
    scrapeOptions: request,
    origin: request.origin ?? "api",
    num_tokens: completions.numTokens ?? 0,
  })
    .then(() =>
      updateExtract(extractId, {
        status: "completed",
      }).catch((error) => {
        logger.error(
          `Failed to update extract ${extractId} status to completed: ${error}`,
        );
      }),
    )
    .catch((error) => {
      // Without this handler a logJob failure would be an unhandled rejection.
      logger.error(`Failed to log extract job ${extractId}: ${error}`);
    });

  return {
    success: true,
    data: completions.extract ?? {},
    extractId,
    warning: completions.warning,
    urlTrace: request.urlTrace ? urlTraces : undefined,
  };
}
Nick: refactor 2024-12-26 12:41:37 -03:00			`import { Document, ExtractRequest, URLTrace } from "../../controllers/v1/types";`
			`import { PlanType } from "../../types";`
			`import { logger } from "../logger";`
			`import { processUrl } from "./url-processor";`
			`import { scrapeDocument } from "./document-scraper";`
			`import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";`
			`import { buildDocument } from "./build-document";`
			`import { billTeam } from "../../services/billing/credit_billing";`
			`import { logJob } from "../../services/logging/log_job";`
Nick: async background index 2024-12-30 21:42:01 -03:00			`import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";`
			`import { saveCrawl, StoredCrawl } from "../crawl-redis";`
Nick: 2025-01-07 16:16:01 -03:00			`import { updateExtract } from "./extract-redis";`
Nick: links-billed update (temp) 2025-01-08 15:13:33 -03:00			`import { CUSTOM_U_TEAMS } from "./config";`
Nick: refactor 2024-12-26 12:41:37 -03:00
			`interface ExtractServiceOptions {`
			`request: ExtractRequest;`
			`teamId: string;`
			`plan: PlanType;`
			`subId?: string;`
			`}`

			`interface ExtractResult {`
			`success: boolean;`
			`data?: any;`
Nick: init 2025-01-03 20:44:27 -03:00			`extractId: string;`
Nick: refactor 2024-12-26 12:41:37 -03:00			`warning?: string;`
Nick: making it optional for the user 2024-12-26 12:43:58 -03:00			`urlTrace?: URLTrace[];`
Nick: refactor 2024-12-26 12:41:37 -03:00			`error?: string;`
			`}`

Nick: async background index 2024-12-30 21:42:01 -03:00			`function getRootDomain(url: string): string {`
			`try {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (url.endsWith("/*")) {`
Nick: async background index 2024-12-30 21:42:01 -03:00			`url = url.slice(0, -2);`
			`}`
			`const urlObj = new URL(url);`
			return `${urlObj.protocol}//${urlObj.hostname}`;
			`} catch (e) {`
			`return url;`
			`}`
			`}`

Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`export async function performExtraction(`
			`extractId: string,`
			`options: ExtractServiceOptions,`
			`): Promise<ExtractResult> {`
Nick: refactor 2024-12-26 12:41:37 -03:00			`const { request, teamId, plan, subId } = options;`
			`const urlTraces: URLTrace[] = [];`
			`let docs: Document[] = [];`

			`// Process URLs`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`const urlPromises = request.urls.map((url) =>`
			`processUrl(`
			`{`
			`url,`
			`prompt: request.prompt,`
			`teamId,`
			`plan,`
			`allowExternalLinks: request.allowExternalLinks,`
			`origin: request.origin,`
			`limit: request.limit,`
			`includeSubdomains: request.includeSubdomains,`
			`},`
			`urlTraces,`
			`),`
Nick: refactor 2024-12-26 12:41:37 -03:00			`);`

			`const processedUrls = await Promise.all(urlPromises);`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`const links = processedUrls.flat().filter((url) => url);`
Nick: refactor 2024-12-26 12:41:37 -03:00
			`if (links.length === 0) {`
			`return {`
			`success: false,`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`error:`
			`"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",`
Nick: init 2025-01-03 20:44:27 -03:00			`extractId,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`urlTrace: urlTraces,`
			`};`
			`}`

			`// Scrape documents`
			`const timeout = Math.floor((request.timeout \|\| 40000) * 0.7) \|\| 30000;`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`const scrapePromises = links.map((url) =>`
			`scrapeDocument(`
			`{`
			`url,`
			`teamId,`
			`plan,`
			`origin: request.origin \|\| "api",`
			`timeout,`
			`},`
			`urlTraces,`
			`),`
Nick: refactor 2024-12-26 12:41:37 -03:00			`);`

			`try {`
			`const results = await Promise.all(scrapePromises);`
			`docs.push(...results.filter((doc): doc is Document => doc !== null));`
			`} catch (error) {`
			`return {`
			`success: false,`
			`error: error.message,`
Nick: init 2025-01-03 20:44:27 -03:00			`extractId,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`urlTrace: urlTraces,`
			`};`
			`}`

			`// Generate completions`
			`const completions = await generateOpenAICompletions(`
			`logger.child({ method: "extractService/generateOpenAICompletions" }),`
			`{`
			`mode: "llm",`
			`systemPrompt:`
			(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
broader search if didnt find results 2025-01-02 18:00:18 -03:00			`"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +`
Nick: refactor 2024-12-26 12:41:37 -03:00			`links.join(", "),`
			`prompt: request.prompt,`
			`schema: request.schema,`
			`},`
			`docs.map((x) => buildDocument(x)).join("\n"),`
			`undefined,`
			`true,`
			`);`

			`// Update token usage in traces`
			`if (completions.numTokens) {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`const totalLength = docs.reduce(`
			`(sum, doc) => sum + (doc.markdown?.length \|\| 0),`
			`0,`
			`);`
Nick: refactor 2024-12-26 12:41:37 -03:00			`docs.forEach((doc) => {`
			`if (doc.metadata?.sourceURL) {`
			`const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);`
			`if (trace && trace.contentStats) {`
			`trace.contentStats.tokensUsed = Math.floor(`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`((doc.markdown?.length \|\| 0) / totalLength) * completions.numTokens,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`);`
			`}`
			`}`
			`});`
			`}`

Nick: async background index 2024-12-30 21:42:01 -03:00			`// Kickoff background crawl for indexing root domains`
Update extraction-service.ts 2024-12-31 15:22:50 -03:00			`// const rootDomains = new Set(request.urls.map(getRootDomain));`
			`// rootDomains.forEach(async url => {`
			`// const crawlId = crypto.randomUUID();`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00
Update extraction-service.ts 2024-12-31 15:22:50 -03:00			`// // Create and save crawl configuration first`
			`// const sc: StoredCrawl = {`
			`// originUrl: url,`
			`// crawlerOptions: {`
			`// maxDepth: 15,`
			`// limit: 5000,`
			`// includePaths: [],`
			`// excludePaths: [],`
			`// ignoreSitemap: false,`
			`// includeSubdomains: true,`
			`// allowExternalLinks: false,`
			`// allowBackwardLinks: true`
			`// },`
			`// scrapeOptions: {`
			`// formats: ["markdown"],`
			`// onlyMainContent: true,`
			`// waitFor: 0,`
			`// mobile: false,`
			`// removeBase64Images: true,`
			`// fastMode: false,`
			`// parsePDF: true,`
			`// skipTlsVerification: false,`
			`// },`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`// internalOptions: {`
Update extraction-service.ts 2024-12-31 15:22:50 -03:00			`// disableSmartWaitCache: true,`
			`// isBackgroundIndex: true`
			`// },`
			`// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,`
			`// createdAt: Date.now(),`
			`// plan: "hobby", // make it a low concurrency`
			`// };`

			`// // Save the crawl configuration`
			`// await saveCrawl(crawlId, sc);`

			`// // Then kick off the job`
			`// await _addScrapeJobToBullMQ({`
			`// url,`
			`// mode: "kickoff" as const,`
			`// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,`
			`// plan: "hobby", // make it a low concurrency`
			`// crawlerOptions: sc.crawlerOptions,`
			`// scrapeOptions: sc.scrapeOptions,`
			`// internalOptions: sc.internalOptions,`
			`// origin: "index",`
			`// crawl_id: crawlId,`
			`// webhook: null,`
			`// v1: true,`
			`// }, {}, crypto.randomUUID(), 50);`
			`// });`
Nick: async background index 2024-12-30 21:42:01 -03:00
Nick: links-billed update (temp) 2025-01-08 15:13:33 -03:00			`let linksBilled = links.length * 5;`

Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (CUSTOM_U_TEAMS.includes(teamId)) {`
Nick: links-billed update (temp) 2025-01-08 15:13:33 -03:00			`linksBilled = 1;`
			`}`
Nick: refactor 2024-12-26 12:41:37 -03:00			`// Bill team for usage`
Nick: links-billed update (temp) 2025-01-08 15:13:33 -03:00			`billTeam(teamId, subId, linksBilled).catch((error) => {`
Nick: refactor 2024-12-26 12:41:37 -03:00			`logger.error(`
Nick: links-billed update (temp) 2025-01-08 15:13:33 -03:00			`Failed to bill team ${teamId} for ${linksBilled} credits: ${error}`,
Nick: refactor 2024-12-26 12:41:37 -03:00			`);`
			`});`

			`// Log job`
			`logJob({`
Nick: init 2025-01-03 20:44:27 -03:00			`job_id: extractId,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`success: true,`
			`message: "Extract completed",`
			`num_docs: 1,`
			`docs: completions.extract ?? {},`
			`time_taken: (new Date().getTime() - Date.now()) / 1000,`
			`team_id: teamId,`
			`mode: "extract",`
			`url: request.urls.join(", "),`
			`scrapeOptions: request,`
			`origin: request.origin ?? "api",`
			`num_tokens: completions.numTokens ?? 0,`
Nick: 2025-01-07 16:16:01 -03:00			`}).then(() => {`
			`updateExtract(extractId, {`
			`status: "completed",`
			`}).catch((error) => {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`logger.error(`
			`Failed to update extract ${extractId} status to completed: ${error}`,
			`);`
Nick: 2025-01-07 16:16:01 -03:00			`});`
Nick: refactor 2024-12-26 12:41:37 -03:00			`});`

			`return {`
			`success: true,`
			`data: completions.extract ?? {},`
Nick: init 2025-01-03 20:44:27 -03:00			`extractId,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`warning: completions.warning,`
Nick: making it optional for the user 2024-12-26 12:43:58 -03:00			`urlTrace: request.urlTrace ? urlTraces : undefined,`
Nick: refactor 2024-12-26 12:41:37 -03:00			`};`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}`