feat(rust): update rust sdk to support new features (#1446)
* chore(rust-sdk): cargo fmt
* feat(rust-sdk): implement search api + example + test
* feat(rust-sdk): implement crawl cancel api + example + test
* feat(rust-sdk): implement crawl check errors api + example + test
* feat(rust-sdk): implement batch crawl + test + example + fix MapOptions
* feat(rust-sdk): implement extract api + test + example
* feat(rust-sdk): implement llmtxt api + test + example
* chore(rust-sdk): correct mock tests
* chore(rust-sdk): prep for cargo distribution
This commit is contained in:
+307
-26
@@ -2,7 +2,11 @@ use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};
|
||||
use crate::{
|
||||
document::Document,
|
||||
scrape::{ScrapeFormats, ScrapeOptions},
|
||||
FirecrawlApp, FirecrawlError, API_VERSION,
|
||||
};
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
|
||||
pub enum CrawlScrapeFormats {
|
||||
@@ -23,13 +27,13 @@ pub enum CrawlScrapeFormats {
|
||||
Links,
|
||||
|
||||
/// Will result in a URL to a screenshot of the page.
|
||||
///
|
||||
///
|
||||
/// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
|
||||
#[serde(rename = "screenshot")]
|
||||
Screenshot,
|
||||
|
||||
/// Will result in a URL to a full-page screenshot of the page.
|
||||
///
|
||||
///
|
||||
/// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
|
||||
#[serde(rename = "screenshot@fullPage")]
|
||||
ScreenshotFullPage,
|
||||
@@ -59,12 +63,12 @@ pub struct CrawlScrapeOptions {
|
||||
pub only_main_content: Option<bool>,
|
||||
|
||||
/// HTML tags to exclusively include.
|
||||
///
|
||||
///
|
||||
/// For example, if you pass `div`, you will only get content from `<div>`s and their children.
|
||||
pub include_tags: Option<Vec<String>>,
|
||||
|
||||
/// HTML tags to exclude.
|
||||
///
|
||||
///
|
||||
/// For example, if you pass `img`, you will never get image URLs in your results.
|
||||
pub exclude_tags: Option<Vec<String>>,
|
||||
|
||||
@@ -81,7 +85,9 @@ pub struct CrawlScrapeOptions {
|
||||
impl From<CrawlScrapeOptions> for ScrapeOptions {
|
||||
fn from(value: CrawlScrapeOptions) -> Self {
|
||||
ScrapeOptions {
|
||||
formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
|
||||
formats: value
|
||||
.formats
|
||||
.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
|
||||
only_main_content: value.only_main_content,
|
||||
include_tags: value.include_tags,
|
||||
exclude_tags: value.exclude_tags,
|
||||
@@ -101,12 +107,12 @@ pub struct CrawlOptions {
|
||||
pub scrape_options: Option<CrawlScrapeOptions>,
|
||||
|
||||
/// URL RegEx patterns to (exclusively) include.
|
||||
///
|
||||
///
|
||||
/// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
|
||||
pub include_paths: Option<Vec<String>>,
|
||||
|
||||
/// URL RegEx patterns to exclude.
|
||||
///
|
||||
///
|
||||
/// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
|
||||
pub exclude_paths: Option<Vec<String>>,
|
||||
|
||||
@@ -200,6 +206,29 @@ pub struct CrawlStatus {
|
||||
pub data: Vec<Document>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlError {
|
||||
pub id: String,
|
||||
pub timestamp: Option<String>,
|
||||
pub url: String,
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlErrorsResponse {
|
||||
pub errors: Vec<CrawlError>,
|
||||
#[serde(rename = "robotsBlocked")]
|
||||
pub robots_blocked: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CancelCrawlResponse {
|
||||
pub status: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlAsyncResponse {
|
||||
@@ -223,19 +252,20 @@ impl FirecrawlApp {
|
||||
url: url.as_ref().to_string(),
|
||||
options: options.unwrap_or_default(),
|
||||
};
|
||||
|
||||
|
||||
let headers = self.prepare_headers(body.options.idempotency_key.as_ref());
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||
.post(format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||
.headers(headers.clone())
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
|
||||
|
||||
self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
|
||||
self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
|
||||
.await
|
||||
}
|
||||
|
||||
/// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
|
||||
@@ -245,38 +275,65 @@ impl FirecrawlApp {
|
||||
options: impl Into<Option<CrawlOptions>>,
|
||||
) -> Result<CrawlStatus, FirecrawlError> {
|
||||
let options = options.into();
|
||||
let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
|
||||
let poll_interval = options
|
||||
.as_ref()
|
||||
.and_then(|x| x.poll_interval)
|
||||
.unwrap_or(2000);
|
||||
let res = self.crawl_url_async(url, options).await?;
|
||||
|
||||
self.monitor_job_status(&res.id, poll_interval).await
|
||||
}
|
||||
|
||||
async fn check_crawl_status_next(&self, next: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
|
||||
async fn check_crawl_status_next(
|
||||
&self,
|
||||
next: impl AsRef<str>,
|
||||
) -> Result<CrawlStatus, FirecrawlError> {
|
||||
let response = self
|
||||
.client
|
||||
.get(next.as_ref())
|
||||
.headers(self.prepare_headers(None))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpError(format!("Paginating crawl using URL {:?}", next.as_ref()), e))?;
|
||||
.map_err(|e| {
|
||||
FirecrawlError::HttpError(
|
||||
format!("Paginating crawl using URL {:?}", next.as_ref()),
|
||||
e,
|
||||
)
|
||||
})?;
|
||||
|
||||
self.handle_response(response, format!("Paginating crawl using URL {:?}", next.as_ref())).await
|
||||
self.handle_response(
|
||||
response,
|
||||
format!("Paginating crawl using URL {:?}", next.as_ref()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
|
||||
pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
|
||||
pub async fn check_crawl_status(
|
||||
&self,
|
||||
id: impl AsRef<str>,
|
||||
) -> Result<CrawlStatus, FirecrawlError> {
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
.get(format!(
|
||||
"{}{}/crawl/{}",
|
||||
self.api_url, API_VERSION, id.as_ref()
|
||||
self.api_url,
|
||||
API_VERSION,
|
||||
id.as_ref()
|
||||
))
|
||||
.headers(self.prepare_headers(None))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
|
||||
.map_err(|e| {
|
||||
FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
|
||||
})?;
|
||||
|
||||
let mut status: CrawlStatus = self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await?;
|
||||
let mut status: CrawlStatus = self
|
||||
.handle_response(
|
||||
response,
|
||||
format!("Checking status of crawl {}", id.as_ref()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if status.status == CrawlStatusTypes::Completed {
|
||||
while let Some(next) = status.next {
|
||||
@@ -304,16 +361,240 @@ impl FirecrawlApp {
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
|
||||
}
|
||||
CrawlStatusTypes::Failed => {
|
||||
break Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job failed."
|
||||
), status_data));
|
||||
break Err(FirecrawlError::CrawlJobFailed(
|
||||
"Crawl job failed".into(),
|
||||
status_data,
|
||||
));
|
||||
}
|
||||
CrawlStatusTypes::Cancelled => {
|
||||
break Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job was cancelled."
|
||||
), status_data));
|
||||
break Err(FirecrawlError::CrawlJobFailed(
|
||||
"Crawl job was cancelled.".into(),
|
||||
status_data,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancel an asynchronous crawl job using the Firecrawl API.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A response indicating whether the cancellation was successful, or a FirecrawlError if the request fails.
|
||||
pub async fn cancel_crawl(
|
||||
&self,
|
||||
id: impl AsRef<str>,
|
||||
) -> Result<CancelCrawlResponse, FirecrawlError> {
|
||||
let response = self
|
||||
.client
|
||||
.delete(format!(
|
||||
"{}{}/crawl/{}",
|
||||
self.api_url,
|
||||
API_VERSION,
|
||||
id.as_ref()
|
||||
))
|
||||
.headers(self.prepare_headers(None))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
FirecrawlError::HttpError(format!("Cancelling crawl {}", id.as_ref()), e)
|
||||
})?;
|
||||
|
||||
self.handle_response(response, "crawl_cancel").await
|
||||
}
|
||||
|
||||
/// Returns information about crawl errors.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A response containing information about crawl errors, or a FirecrawlError if the request fails.
|
||||
pub async fn check_crawl_errors(
|
||||
&self,
|
||||
id: impl AsRef<str>,
|
||||
) -> Result<CrawlErrorsResponse, FirecrawlError> {
|
||||
let response = self
|
||||
.client
|
||||
.get(format!(
|
||||
"{}{}/crawl/{}/errors",
|
||||
self.api_url,
|
||||
API_VERSION,
|
||||
id.as_ref()
|
||||
))
|
||||
.headers(self.prepare_headers(None))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
FirecrawlError::HttpError(format!("Checking errors for crawl {}", id.as_ref()), e)
|
||||
})?;
|
||||
|
||||
self.handle_response(response, "crawl_check").await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Starts a real crawl against a self-hosted instance and cancels it right away.
    ///
    /// Ignored by default; requires the `FIRECRAWL_API_URL` environment variable.
    #[tokio::test]
    #[ignore = "Makes real network request"]
    async fn test_real_cancel_crawl() {
        let api_url = std::env::var("FIRECRAWL_API_URL")
            .expect("Please set the FIRECRAWL_API_URL environment variable");
        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();

        // First start a crawl job
        let crawl_response = app
            .crawl_url_async("https://example.com", None)
            .await
            .unwrap();

        // Then cancel it
        let cancel_response = app.cancel_crawl(crawl_response.id).await.unwrap();

        assert_eq!(cancel_response.status, "cancelled");
    }

    /// A 200 response to the `DELETE` request is deserialized into `CancelCrawlResponse`.
    #[tokio::test]
    async fn test_cancel_crawl_with_mock() {
        let mut server = mockito::Server::new_async().await;

        // Set up the mock for the cancel request
        let mock = server
            .mock("DELETE", "/v1/crawl/test-crawl-id")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": null,
                    "status": "cancelled"
                })
                .to_string(),
            )
            .create_async()
            .await;

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let response = app.cancel_crawl("test-crawl-id").await.unwrap();

        assert_eq!(response.status, "cancelled");
        mock.assert_async().await;
    }

    /// A 404 response with `success: false` surfaces as an `Err` from `cancel_crawl`.
    #[tokio::test]
    async fn test_cancel_crawl_error_response() {
        let mut server = mockito::Server::new_async().await;

        // Set up the mock for an error response
        let mock = server
            .mock("DELETE", "/v1/crawl/invalid-id")
            .with_status(404)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": false,
                    "error": "Crawl job not found"
                })
                .to_string(),
            )
            .create_async()
            .await;

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let result = app.cancel_crawl("invalid-id").await;

        assert!(result.is_err());
        mock.assert_async().await;
    }

    /// Crawls an unresolvable host on a real instance and expects at least one recorded error.
    ///
    /// Ignored by default; requires the `FIRECRAWL_API_URL` environment variable.
    #[tokio::test]
    #[ignore = "Makes real network request"]
    async fn test_real_check_crawl_errors() {
        let api_url = std::env::var("FIRECRAWL_API_URL")
            .expect("Please set the FIRECRAWL_API_URL environment variable");
        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();

        // First start a crawl job
        let crawl_response = app
            .crawl_url_async("https://no-wer-agg.invalid", None)
            .await
            .unwrap();

        // Wait before querying, so the job has time to record its failure. Sleeping after
        // the request could not change the response that is asserted on below.
        tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;

        // Check for errors
        let errors_response = app.check_crawl_errors(crawl_response.id).await.unwrap();
        println!("{errors_response:?}");

        assert!(
            !errors_response.errors.is_empty(),
            "WARN: Error returned related to Supabase not in my environment. It may fail"
        );
    }

    /// A 200 response from the errors endpoint is deserialized field by field.
    #[tokio::test]
    async fn test_check_crawl_errors_with_mock() {
        let mut server = mockito::Server::new_async().await;

        // Set up the mock for the check errors request
        let mock = server
            .mock("GET", "/v1/crawl/test-crawl-id/errors")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "errors": [
                        {
                            "id": "error1",
                            "timestamp": "2023-01-01T00:00:00Z",
                            "url": "https://example.com/error-page",
                            "error": "Failed to load page"
                        }
                    ],
                    "robotsBlocked": [
                        "https://example.com/blocked-by-robots"
                    ]
                })
                .to_string(),
            )
            .create_async()
            .await;

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let response = app.check_crawl_errors("test-crawl-id").await.unwrap();

        assert_eq!(response.errors.len(), 1);
        assert_eq!(response.errors[0].id, "error1");
        assert_eq!(response.errors[0].url, "https://example.com/error-page");
        assert_eq!(response.errors[0].error, "Failed to load page");
        assert_eq!(response.robots_blocked.len(), 1);
        assert_eq!(
            response.robots_blocked[0],
            "https://example.com/blocked-by-robots"
        );
        mock.assert_async().await;
    }

    /// A 404 response with `success: false` surfaces as an `Err` from `check_crawl_errors`.
    #[tokio::test]
    async fn test_check_crawl_errors_error_response() {
        let mut server = mockito::Server::new_async().await;

        // Set up the mock for an error response
        let mock = server
            .mock("GET", "/v1/crawl/invalid-id/errors")
            .with_status(404)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": false,
                    "error": "Crawl job not found"
                })
                .to_string(),
            )
            .create_async()
            .await;

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let result = app.check_crawl_errors("invalid-id").await;

        assert!(result.is_err());
        mock.assert_async().await;
    }
}
|
||||
|
||||
Reference in New Issue
Block a user