bump rust version pt.2

bump rust version
webhook param for crawl (#1609 )
2025-06-02 18:10:14 +02:00 · 2025-06-02 18:09:12 +02:00 · 2025-06-02 18:08:32 +02:00 · 2025-05-29 17:00:13 -03:00 · 2025-05-29 20:59:15 +02:00 · 2025-05-29 20:08:52 +02:00
18 changed files with 228 additions and 53 deletions
@@ -373,5 +373,14 @@ describe("Scrape tests", () => {
    });

    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
-  }, 35000);
+  }, 30000);
+
+  it.concurrent("application/json content type is markdownified properly", async () => {
+    const response = await scrape({
+      url: "https://jsonplaceholder.typicode.com/todos/1",
+      formats: ["markdown"],
+    });
+
+    expect(response.markdown).toContain("```json");
+  }, 30000);
 });
@@ -13,6 +13,7 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById } from "../../lib/supabase-jobs";
+import { calculateCreditsToBeBilled } from "../../lib/scrape-billing";

 export async function scrapeController(
  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -132,33 +133,12 @@ export async function scrapeController(
        0 // TODO: fix
      : 0;

-  let creditsToBeBilled = 1; // Assuming 1 credit per document
  if (earlyReturn) {
    // Don't bill if we're early returning
    return;
  }
-  if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
-    creditsToBeBilled = 5;
-  }

-  if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
-    if (process.env.USE_DB_AUTHENTICATION === "true") {
-      // @Nick this is a hack pushed at 2AM pls help - mogery
-      const job = await supabaseGetJobById(jobId);
-      if (!job?.cost_tracking) {
-        logger.warn("No cost tracking found for job", {
-          jobId,
-        });
-      }
-      creditsToBeBilled = Math.ceil((job?.cost_tracking?.totalCost ?? 1) * 1800);
-    } else {
-      creditsToBeBilled = 150;
-    }
-  }
-
-  if (doc?.metadata?.proxyUsed === "stealth") {
-    creditsToBeBilled += 4;
-  }
+  let creditsToBeBilled = await calculateCreditsToBeBilled(req.body, doc, jobId);

  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
    (error) => {
@@ -255,7 +255,13 @@ export async function searchController(
    }

    // Bill team once for all successful results
-    billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.length).catch((error) => {
+    billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.reduce((a,x) => {
+      if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
+        return a + x.metadata.numPages;
+      } else {
+        return a + 1;
+      }
+    }, 0)).catch((error) => {
      logger.error(
        `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
      );
@@ -750,6 +750,7 @@ export type Document = {
    scrapeId?: string;
    error?: string;
    numPages?: number;
+    contentType?: string;
    proxyUsed: "basic" | "stealth";
    // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
  };
@@ -95,8 +95,13 @@ function startServer(port = DEFAULT_PORT) {
    logger.info(`Worker ${process.pid} listening on port ${port}`);
  });

-  const exitHandler = () => {
+  const exitHandler = async () => {
    logger.info("SIGTERM signal received: closing HTTP server");
+    if (process.env.IS_KUBERNETES === "true") {
+      // Account for GCE load balancer drain timeout
+      logger.info("Waiting 60s for GCE load balancer drain timeout");
+      await new Promise((resolve) => setTimeout(resolve, 60000));
+    }
    server.close(() => {
      logger.info("Server closed.");
      process.exit(0);
@@ -0,0 +1,49 @@
+import { Document, ScrapeOptions } from "../controllers/v1/types";
+import { supabaseGetJobById } from "./supabase-jobs";
+import { logger } from "./logger";
+import { CostTracking } from "./extract/extraction-service";
+
+const creditsPerPDFPage = 1;
+const stealthProxyCostBonus = 4;
+
+export async function calculateCreditsToBeBilled(options: ScrapeOptions, document: Document, jobId: string, costTracking?: any) {
+    let creditsToBeBilled = 1; // Assuming 1 credit per document
+    if ((options.extract && options.formats?.includes("extract")) || (options.formats?.includes("changeTracking") && options.changeTrackingOptions?.modes?.includes("json"))) {
+        creditsToBeBilled = 5;
+    }
+
+    if (options.agent?.model?.toLowerCase() === "fire-1" || options.extract?.agent?.model?.toLowerCase() === "fire-1" || options.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
+        if (process.env.USE_DB_AUTHENTICATION === "true") {
+            // @Nick this is a hack pushed at 2AM pls help - mogery
+            if (!costTracking) {
+                const job = await supabaseGetJobById(jobId);
+                costTracking = job?.cost_tracking;
+            }
+
+            if (!costTracking) {
+                logger.warn("No cost tracking found for job", {
+                    jobId,
+                    scrapeId: jobId
+                });
+            }
+            
+            if (costTracking instanceof CostTracking) {
+                costTracking = costTracking.toJSON();
+            }
+
+            creditsToBeBilled = Math.ceil((costTracking?.totalCost ?? 1) * 1800);
+        } else {
+            creditsToBeBilled = 150;
+        }
+    } 
+    
+    if (document.metadata.numPages !== undefined && document.metadata.numPages > 1) {
+        creditsToBeBilled += creditsPerPDFPage * (document.metadata.numPages - 1);
+    }
+
+    if (document?.metadata?.proxyUsed === "stealth") {
+        creditsToBeBilled += stealthProxyCostBonus;
+    }
+
+    return creditsToBeBilled;
+}
@@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
    url: string;
    body: string,
    status: number;
-    headers: any;
+    headers: [string, string][];
  };

  if (meta.mock !== null) {
@@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
    url: response.url,
    html: response.body,
    statusCode: response.status,
+    contentType: (response.headers.find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
  };
 }
@@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
    error: response.pageError,
    statusCode: response.pageStatusCode,

+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
+
    screenshot: response.screenshot,
    ...(actions.length > 0
      ? {
@@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
    error: response.pageError,
    statusCode: response.pageStatusCode,

+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
+
    ...(response.screenshots !== undefined && response.screenshots.length > 0
      ? {
          screenshot: response.screenshots[0],
@@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
+
+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
  };
 }
@@ -111,6 +111,8 @@ export type EngineScrapeResult = {
  };

  numPages?: number;
+
+  contentType?: string;
 };

 const engineHandlers: {
@@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
      statusCode: result.result.statusCode,
      error: result.result.error,
      numPages: result.result.numPages,
+      contentType: result.result.contentType,
      proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
    },
  };
@@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
    );
  }

+  if (document.metadata.contentType?.includes("application/json")) {
+    if (document.rawHtml === undefined) {
+      throw new Error(
+        "rawHtml is undefined -- this transformer is being called out of order",
+      );
+    }
+
+    document.markdown = "```json\n" + document.rawHtml + "\n```";
+    return document;
+  }
+
  document.markdown = await parseMarkdown(document.html);
  return document;
 }
@@ -5,6 +5,58 @@ import { logger } from "../lib/logger";

 dotenv.config();

+
+export async function fire_engine_search(
+  q: string,
+  options: {
+    tbs?: string;
+    filter?: string;
+    lang?: string;
+    country?: string;
+    location?: string;
+    numResults: number;
+    page?: number;
+  },
+  abort?: AbortSignal,
+): Promise<SearchResult[]> {
+  try {
+    let data = JSON.stringify({
+      query: q,
+      lang: options.lang,
+      country: options.country,
+      location: options.location,
+      tbs: options.tbs,
+      numResults: options.numResults,
+      page: options.page ?? 1,
+    });
+
+    if (!process.env.FIRE_ENGINE_BETA_URL) {
+      return [];
+    }
+
+    const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        "X-Disable-Cache": "true",
+      },
+      body: data,
+      signal: abort,
+    });
+
+    if (response.ok) {
+      const responseData = await response.json();
+      return responseData;
+    } else {
+      return [];
+    }
+  } catch (error) {
+    logger.error(error);
+    Sentry.captureException(error);
+    return [];
+  }
+}
+
 export async function fireEngineMap(
  q: string,
  options: {
@@ -34,7 +86,7 @@ export async function fireEngineMap(
      return [];
    }

-    const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
+    const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/map`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
@@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch";
 import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 import { searxng_search } from "./searxng";
+import { fire_engine_search } from "./fireEngine";

 export async function search({
  query,
@@ -31,8 +32,19 @@ export async function search({
  timeout?: number;
 }): Promise<SearchResult[]> {
  try {
+    if (process.env.FIRE_ENGINE_BETA_URL) {
+      const results = await fire_engine_search(query, {
+        numResults: num_results,
+        tbs,
+        filter,
+        lang,
+        country,
+        location,
+      });
+      if (results.length > 0) return results;
+    }
    if (process.env.SERPER_API_KEY) {
-      return await serper_search(query, {
+      const results = await serper_search(query, {
        num_results,
        tbs,
        filter,
@@ -40,9 +52,10 @@ export async function search({
        country,
        location,
      });
+      if (results.length > 0) return results;
    }
    if (process.env.SEARCHAPI_API_KEY) {
-      return await searchapi_search(query, {
+      const results = await searchapi_search(query, {
        num_results,
        tbs,
        filter,
@@ -50,9 +63,10 @@ export async function search({
        country,
        location,
      });
+      if (results.length > 0) return results;
    }
    if (process.env.SEARXNG_ENDPOINT) {
-      return await searxng_search(query, {
+      const results = await searxng_search(query, {
        num_results,
        tbs,
        filter,
@@ -60,6 +74,7 @@ export async function search({
        country,
        location,
      });
+      if (results.length > 0) return results;
    }
    return await googleSearch(
      query,
@@ -66,10 +66,10 @@ export function getIndexQueue() {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
-          age: 90000, // 25 hours
+          age: 3600, // 1 hour
        },
        removeOnFail: {
-          age: 90000, // 25 hours
+          age: 3600, // 1 hour
        },
      },
    });
@@ -120,10 +120,10 @@ export function getBillingQueue() {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
-          age: 90000, // 25 hours
+          age: 3600, // 1 hour
        },
        removeOnFail: {
-          age: 90000, // 25 hours
+          age: 3600, // 1 hour
        },
      },
    });
@@ -85,6 +85,7 @@ import https from "https";
 import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
 import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
 import { RateLimiterMode } from "../types";
+import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
 import { redisEvictConnection } from "./redis";

 configDotenv();
@@ -1384,22 +1385,7 @@ async function processJob(job: Job & { id: string }, token: string) {
    }

    if (job.data.is_scrape !== true) {
-      let creditsToBeBilled = 1; // Assuming 1 credit per document
-      if ((job.data.scrapeOptions.extract && job.data.scrapeOptions.formats?.includes("extract")) || (job.data.scrapeOptions.formats?.includes("changeTracking") && job.data.scrapeOptions.changeTrackingOptions?.modes?.includes("json"))) {
-        creditsToBeBilled = 5;
-      }
-
-      if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.extract?.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
-        if (process.env.USE_DB_AUTHENTICATION === "true") {
-          creditsToBeBilled = Math.ceil((costTracking.toJSON().totalCost ?? 1) * 1800);
-        } else {
-          creditsToBeBilled = 150;
-        }
-      }
-
-      if (doc.metadata?.proxyUsed === "stealth") {
-        creditsToBeBilled += 4;
-      }
+      let creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, doc, job.id, costTracking);

      if (
        job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
@@ -680,7 +680,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"

 [[package]]
 name = "firecrawl"
-version = "1.1.0"
+version = "1.2.0"
 dependencies = [
 "assert_matches",
 "axum",
@@ -1,7 +1,7 @@
 [package]
 name = "firecrawl"
 author= "Mendable.ai"
-version = "1.1.0"
+version = "1.2.0"
 edition = "2021"
 license = "MIT"
 homepage = "https://www.firecrawl.dev/"
@@ -99,6 +99,49 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
    }
 }

+/// Options for webhook notifications
+#[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct WebhookOptions {
+    /// URL to send webhook notifications to
+    pub url: String,
+
+    /// Custom headers to include in webhook requests
+    pub headers: Option<HashMap<String, String>>,
+
+    /// Custom data included in all webhook payloads
+    pub metadata: Option<HashMap<String, String>>,
+
+    /// Event types to receive
+    pub events: Option<Vec<WebhookEvent>>,
+}
+
+impl From<String> for WebhookOptions {
+    fn from(value: String) -> Self {
+        Self {
+            url: value,
+            ..Default::default()
+        }
+    }
+}
+
+#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
+#[serde(rename_all = "camelCase")]
+pub enum WebhookEvent {
+    /// Crawl finished successfully
+    Completed,
+
+    /// Crawl encountered an error
+    Failed,
+
+    /// Individual page scraped
+    Page,
+
+    /// Crawl job initiated
+    Started,
+}
+
 #[serde_with::skip_serializing_none]
 #[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
@@ -132,7 +175,7 @@ pub struct CrawlOptions {
    pub allow_external_links: Option<bool>,

    /// URL to send Webhook crawl events to.
-    pub webhook: Option<String>,
+    pub webhook: Option<WebhookOptions>,

    /// Idempotency key to send to the crawl endpoint.
    #[serde(skip)]
Author	SHA1	Message	Date
Gergő Móricz	1396451d31	bump rust version pt.2	2025-06-02 18:10:14 +02:00
Gergő Móricz	07fb651a91	bump rust version	2025-06-02 18:09:12 +02:00
Supasin Liulak	6a76ccfacb	webhook param for crawl (#1609 )	2025-06-02 18:08:32 +02:00
Nicolas	9297afd1ff	Nick: search	2025-05-29 17:00:13 -03:00
Gergő Móricz	a8e0482718	feat(search): bill for PDFs properly	2025-05-29 20:59:15 +02:00
Gergő Móricz	a2f41fb650	feat(api/server): wait 60s for GCE load balancer drain timeout To minimize 502s.	2025-05-29 20:08:52 +02:00
Gergő Móricz	3ea221b093	fix(api/queue): tighten expiries on indexQueue jobs	2025-05-29 16:36:55 +02:00
Gergő Móricz	c9dd0e609a	fix(api/queue): tighten expiries on billingQueue jobs	2025-05-29 16:26:52 +02:00
Gergő Móricz	93655b5c0b	feat(scrapeURL/pdf): bill n credits per page (FIR-1934) (#1553 ) * feat(scrapeURL/pdf): bill n credits per page * Update scrape.ts * Update queue-worker.ts * separate billing logi --------- Co-authored-by: Nicolas <nicolascamara29@gmail.com>	2025-05-29 16:01:08 +02:00
Gergő Móricz	38c96b524f	feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604 )	2025-05-29 15:26:07 +02:00