feat(rust): update rust sdk to support new features (#1446)

* chore(rust-sdk): cargo fmt
* feat(rust-sdk): implement search api + example + test
* feat(rust-sdk): implement crawl cancel api + example + test
* feat(rust-sdk): implement crawl check errors api + example + test
* feat(rust-sdk): implement batch crawl + test + example + Fix MapOptions
* feat(rust-sdk): implement extract api + test + example
* feat(rust-sdk): implement llmtxt api + test + example
* chore(rust-sdk): correct mock tests
* chore(rust-sdk): prep for cargo distribution
2025-04-18 07:59:59 +03:00
parent 33aece8e96
commit f2c01340d1
20 changed files with 4350 additions and 125 deletions
@@ -2,7 +2,11 @@ use std::collections::HashMap;

 use serde::{Deserialize, Serialize};

-use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};
+use crate::{
+    document::Document,
+    scrape::{ScrapeFormats, ScrapeOptions},
+    FirecrawlApp, FirecrawlError, API_VERSION,
+};

 #[derive(Deserialize, Serialize, Clone, Copy, Debug)]
 pub enum CrawlScrapeFormats {
@@ -23,13 +27,13 @@ pub enum CrawlScrapeFormats {
    Links,

    /// Will result in a URL to a screenshot of the page.
-    /// 
+    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
-    /// 
+    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
@@ -59,12 +63,12 @@ pub struct CrawlScrapeOptions {
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
-    /// 
+    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
-    /// 
+    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

@@ -81,7 +85,9 @@ pub struct CrawlScrapeOptions {
 impl From<CrawlScrapeOptions> for ScrapeOptions {
    fn from(value: CrawlScrapeOptions) -> Self {
        ScrapeOptions {
-            formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
+            formats: value
+                .formats
+                .map(|formats| formats.into_iter().map(|x| x.into()).collect()),
            only_main_content: value.only_main_content,
            include_tags: value.include_tags,
            exclude_tags: value.exclude_tags,
@@ -101,12 +107,12 @@ pub struct CrawlOptions {
    pub scrape_options: Option<CrawlScrapeOptions>,

    /// URL RegEx patterns to (exclusively) include.
-    /// 
+    ///
    /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
    pub include_paths: Option<Vec<String>>,

    /// URL RegEx patterns to exclude.
-    /// 
+    ///
    /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
    pub exclude_paths: Option<Vec<String>>,

@@ -200,6 +206,29 @@ pub struct CrawlStatus {
    pub data: Vec<Document>,
 }

+/// A single entry of the `errors` list in a [`CrawlErrorsResponse`].
+#[derive(Deserialize, Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct CrawlError {
+    /// Identifier of this entry (presumably the ID of the failed scrape — TODO confirm against the API).
+    pub id: String,
+    /// Timestamp string exactly as sent by the server; `None` when the field is missing or null.
+    pub timestamp: Option<String>,
+    /// URL the error is associated with.
+    pub url: String,
+    /// Error message text.
+    pub error: String,
+}
+
+/// Payload returned by [`FirecrawlApp::check_crawl_errors`].
+#[derive(Deserialize, Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct CrawlErrorsResponse {
+    /// Errors reported for the crawl.
+    pub errors: Vec<CrawlError>,
+    /// Mapped to the JSON key `robotsBlocked` by the container-level
+    /// `rename_all = "camelCase"`, so no field-level rename is needed.
+    /// Presumably the URLs that were skipped because of robots.txt — TODO confirm.
+    pub robots_blocked: Vec<String>,
+}
+
+/// Payload returned by [`FirecrawlApp::cancel_crawl`].
+#[derive(Deserialize, Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct CancelCrawlResponse {
+    /// Status string reported by the server (the tests in this file expect `"cancelled"`).
+    pub status: String,
+}
+
 #[derive(Deserialize, Serialize, Debug, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlAsyncResponse {
@@ -223,19 +252,20 @@ impl FirecrawlApp {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
        };
-        
+
        let headers = self.prepare_headers(body.options.idempotency_key.as_ref());

        let response = self
            .client
-            .post(&format!("{}{}/crawl", self.api_url, API_VERSION))
+            .post(format!("{}{}/crawl", self.api_url, API_VERSION))
            .headers(headers.clone())
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;

-        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
+        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
+            .await
    }

    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
@@ -245,38 +275,65 @@ impl FirecrawlApp {
        options: impl Into<Option<CrawlOptions>>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let options = options.into();
-        let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
+        let poll_interval = options
+            .as_ref()
+            .and_then(|x| x.poll_interval)
+            .unwrap_or(2000);
        let res = self.crawl_url_async(url, options).await?;

        self.monitor_job_status(&res.id, poll_interval).await
    }

-    async fn check_crawl_status_next(&self, next: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
+    /// Fetches one page of crawl status from the `next` URL.
+    ///
+    /// `next` is requested as-is (no base URL or API version is prepended), using the
+    /// headers from `prepare_headers(None)`. A failure to send the request is reported
+    /// as `FirecrawlError::HttpError`; the response is then decoded by `handle_response`.
+    async fn check_crawl_status_next(
+        &self,
+        next: impl AsRef<str>,
+    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(next.as_ref())
            .headers(self.prepare_headers(None))
            .send()
            .await
-            .map_err(|e| FirecrawlError::HttpError(format!("Paginating crawl using URL {:?}", next.as_ref()), e))?;
+            .map_err(|e| {
+                FirecrawlError::HttpError(
+                    format!("Paginating crawl using URL {:?}", next.as_ref()),
+                    e,
+                )
+            })?;

-        self.handle_response(response, format!("Paginating crawl using URL {:?}", next.as_ref())).await
+        self.handle_response(
+            response,
+            format!("Paginating crawl using URL {:?}", next.as_ref()),
+        )
+        .await
    }

    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
-    pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
+    pub async fn check_crawl_status(
+        &self,
+        id: impl AsRef<str>,
+    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
-            .get(&format!(
+            .get(format!(
                "{}{}/crawl/{}",
-                self.api_url, API_VERSION, id.as_ref()
+                self.api_url,
+                API_VERSION,
+                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
-            .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
+            .map_err(|e| {
+                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
+            })?;

-        let mut status: CrawlStatus = self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await?;
+        let mut status: CrawlStatus = self
+            .handle_response(
+                response,
+                format!("Checking status of crawl {}", id.as_ref()),
+            )
+            .await?;

        if status.status == CrawlStatusTypes::Completed {
            while let Some(next) = status.next {
@@ -304,16 +361,240 @@ impl FirecrawlApp {
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
-                    break Err(FirecrawlError::CrawlJobFailed(format!(
-                        "Crawl job failed."
-                    ), status_data));
+                    break Err(FirecrawlError::CrawlJobFailed(
+                        "Crawl job failed".into(),
+                        status_data,
+                    ));
                }
                CrawlStatusTypes::Cancelled => {
-                    break Err(FirecrawlError::CrawlJobFailed(format!(
-                        "Crawl job was cancelled."
-                    ), status_data));
+                    break Err(FirecrawlError::CrawlJobFailed(
+                        "Crawl job was cancelled.".into(),
+                        status_data,
+                    ));
                }
            }
        }
    }
+
+    /// Requests cancellation of an asynchronous crawl job.
+    ///
+    /// Issues a `DELETE` to `{api_url}{API_VERSION}/crawl/{id}` with the headers
+    /// produced by `prepare_headers(None)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `FirecrawlError::HttpError` when the request cannot be sent, or
+    /// whatever error `handle_response` produces while decoding the reply.
+    pub async fn cancel_crawl(
+        &self,
+        id: impl AsRef<str>,
+    ) -> Result<CancelCrawlResponse, FirecrawlError> {
+        let crawl_id = id.as_ref();
+        let endpoint = format!("{}{}/crawl/{}", self.api_url, API_VERSION, crawl_id);
+
+        let request = self
+            .client
+            .delete(endpoint)
+            .headers(self.prepare_headers(None));
+
+        let response = match request.send().await {
+            Ok(response) => response,
+            Err(e) => {
+                return Err(FirecrawlError::HttpError(
+                    format!("Cancelling crawl {}", crawl_id),
+                    e,
+                ));
+            }
+        };
+
+        // NOTE(review): the action label is kept verbatim; confirm whether
+        // `handle_response` inspects it before ever rewording it.
+        self.handle_response(response, "crawl_cancel").await
+    }
+
+    /// Retrieves the errors recorded for a crawl job.
+    ///
+    /// Issues a `GET` to `{api_url}{API_VERSION}/crawl/{id}/errors` with the headers
+    /// produced by `prepare_headers(None)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `FirecrawlError::HttpError` when the request cannot be sent, or
+    /// whatever error `handle_response` produces while decoding the reply.
+    pub async fn check_crawl_errors(
+        &self,
+        id: impl AsRef<str>,
+    ) -> Result<CrawlErrorsResponse, FirecrawlError> {
+        let crawl_id = id.as_ref();
+        let endpoint = format!(
+            "{}{}/crawl/{}/errors",
+            self.api_url, API_VERSION, crawl_id
+        );
+
+        let request = self
+            .client
+            .get(endpoint)
+            .headers(self.prepare_headers(None));
+
+        let response = match request.send().await {
+            Ok(response) => response,
+            Err(e) => {
+                return Err(FirecrawlError::HttpError(
+                    format!("Checking errors for crawl {}", crawl_id),
+                    e,
+                ));
+            }
+        };
+
+        // NOTE(review): the action label is kept verbatim; confirm whether
+        // `handle_response` inspects it before ever rewording it.
+        self.handle_response(response, "crawl_check").await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    /// Starts a real crawl and cancels it. Requires `FIRECRAWL_API_URL`; ignored by default.
+    #[tokio::test]
+    #[ignore = "Makes real network request"]
+    async fn test_real_cancel_crawl() {
+        let api_url = std::env::var("FIRECRAWL_API_URL")
+            .expect("Please set the FIRECRAWL_API_URL environment variable");
+        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
+
+        // First start a crawl job
+        let crawl_response = app
+            .crawl_url_async("https://example.com", None)
+            .await
+            .unwrap();
+
+        // Then cancel it
+        let cancel_response = app.cancel_crawl(crawl_response.id).await.unwrap();
+
+        assert_eq!(cancel_response.status, "cancelled");
+    }
+
+    /// A 200 reply to the cancel request is decoded into `CancelCrawlResponse`.
+    #[tokio::test]
+    async fn test_cancel_crawl_with_mock() {
+        let mut server = mockito::Server::new_async().await;
+
+        // Set up the mock for the cancel request
+        let mock = server
+            .mock("DELETE", "/v1/crawl/test-crawl-id")
+            .with_status(200)
+            .with_header("content-type", "application/json")
+            .with_body(
+                json!({
+                    "success": null,
+                    "status": "cancelled"
+                })
+                .to_string(),
+            )
+            .create();
+
+        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
+        let response = app.cancel_crawl("test-crawl-id").await.unwrap();
+
+        assert_eq!(response.status, "cancelled");
+        mock.assert();
+    }
+
+    /// A 404 reply to the cancel request surfaces as an `Err`.
+    #[tokio::test]
+    async fn test_cancel_crawl_error_response() {
+        let mut server = mockito::Server::new_async().await;
+
+        // Set up the mock for an error response
+        let mock = server
+            .mock("DELETE", "/v1/crawl/invalid-id")
+            .with_status(404)
+            .with_header("content-type", "application/json")
+            .with_body(
+                json!({
+                    "success": false,
+                    "error": "Crawl job not found"
+                })
+                .to_string(),
+            )
+            .create();
+
+        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
+        let result = app.cancel_crawl("invalid-id").await;
+
+        assert!(result.is_err());
+        mock.assert();
+    }
+
+    /// Crawls an unresolvable host and expects at least one recorded error.
+    /// Requires `FIRECRAWL_API_URL`; ignored by default.
+    #[tokio::test]
+    #[ignore = "Makes real network request"]
+    async fn test_real_check_crawl_errors() {
+        let api_url = std::env::var("FIRECRAWL_API_URL")
+            .expect("Please set the FIRECRAWL_API_URL environment variable");
+        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
+
+        // First start a crawl job
+        let crawl_response = app
+            .crawl_url_async("https://no-wer-agg.invalid", None)
+            .await
+            .unwrap();
+
+        // Wait BEFORE querying so the crawl has time to attempt the unreachable
+        // host; sleeping after the fetch could not influence the assertion.
+        tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
+
+        // Check for errors
+        let errors_response = app.check_crawl_errors(crawl_response.id).await.unwrap();
+        println!("{errors_response:?}");
+
+        assert!(
+            !errors_response.errors.is_empty(),
+            "WARN: Error returned related to Supabase not in my environment. It may fail"
+        );
+    }
+
+    /// A 200 reply to the errors request is decoded into `CrawlErrorsResponse`,
+    /// including the camelCase `robotsBlocked` key.
+    #[tokio::test]
+    async fn test_check_crawl_errors_with_mock() {
+        let mut server = mockito::Server::new_async().await;
+
+        // Set up the mock for the check errors request
+        let mock = server
+            .mock("GET", "/v1/crawl/test-crawl-id/errors")
+            .with_status(200)
+            .with_header("content-type", "application/json")
+            .with_body(
+                json!({
+                    "success": true,
+                    "errors": [
+                        {
+                            "id": "error1",
+                            "timestamp": "2023-01-01T00:00:00Z",
+                            "url": "https://example.com/error-page",
+                            "error": "Failed to load page"
+                        }
+                    ],
+                    "robotsBlocked": [
+                        "https://example.com/blocked-by-robots"
+                    ]
+                })
+                .to_string(),
+            )
+            .create();
+
+        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
+        let response = app.check_crawl_errors("test-crawl-id").await.unwrap();
+
+        assert_eq!(response.errors.len(), 1);
+        assert_eq!(response.errors[0].id, "error1");
+        assert_eq!(response.errors[0].url, "https://example.com/error-page");
+        assert_eq!(response.errors[0].error, "Failed to load page");
+        assert_eq!(response.robots_blocked.len(), 1);
+        assert_eq!(
+            response.robots_blocked[0],
+            "https://example.com/blocked-by-robots"
+        );
+        mock.assert();
+    }
+
+    /// A 404 reply to the errors request surfaces as an `Err`.
+    #[tokio::test]
+    async fn test_check_crawl_errors_error_response() {
+        let mut server = mockito::Server::new_async().await;
+
+        // Set up the mock for an error response
+        let mock = server
+            .mock("GET", "/v1/crawl/invalid-id/errors")
+            .with_status(404)
+            .with_header("content-type", "application/json")
+            .with_body(
+                json!({
+                    "success": false,
+                    "error": "Crawl job not found"
+                })
+                .to_string(),
+            )
+            .create();
+
+        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
+        let result = app.check_crawl_errors("invalid-id").await;
+
+        assert!(result.is_err());
+        mock.assert();
+    }
 }
 }