fix(rust): improve

This commit is contained in:
Móricz Gergő
2025-01-25 12:59:14 +01:00
parent 4a1ab6f01c
commit dacc5d4f45
+28 -13
View File
@@ -5,8 +5,12 @@ use serde::Deserialize;
use serde_json::Value; use serde_json::Value;
use url::Url; use url::Url;
/// Extracts links from HTML
///
/// # Safety
/// `html` must be a valid pointer to a NUL-terminated, UTF-8 encoded C string containing HTML. Output will be a JSON string array. Output string must be freed with free_string.
#[no_mangle] #[no_mangle]
pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 { pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
let document = parse_html().one(html); let document = parse_html().one(html);
@@ -44,9 +48,12 @@ macro_rules! insert_meta_property {
}; };
} }
/// Extracts metadata from HTML
///
/// # Safety
/// `html` must be a valid pointer to a NUL-terminated, UTF-8 encoded C string containing HTML. Output will be a JSON object. Output string must be freed with free_string.
#[no_mangle] #[no_mangle]
pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 { pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
let document = parse_html().one(html); let document = parse_html().one(html);
@@ -209,12 +216,12 @@ struct ImageSource {
fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> { fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
let mut document = parse_html().one(opts.html); let mut document = parse_html().one(opts.html);
if opts.include_tags.len() > 0 { if !opts.include_tags.is_empty() {
let new_document = parse_html().one("<div></div>"); let new_document = parse_html().one("<div></div>");
let root = new_document.select_first("div")?; let root = new_document.select_first("div")?;
for x in opts.include_tags.iter() { for x in opts.include_tags.iter() {
for tag in document.select(&x)? { for tag in document.select(x)? {
root.as_node().append(tag.as_node().clone()); root.as_node().append(tag.as_node().clone());
} }
} }
@@ -244,16 +251,16 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
for x in opts.exclude_tags.iter() { for x in opts.exclude_tags.iter() {
// TODO: implement weird version // TODO: implement weird version
while let Ok(x) = document.select_first(&x) { while let Ok(x) = document.select_first(x) {
x.as_node().detach(); x.as_node().detach();
} }
} }
if opts.only_main_content { if opts.only_main_content {
for x in EXCLUDE_NON_MAIN_TAGS.iter() { for x in EXCLUDE_NON_MAIN_TAGS.iter() {
let x: Vec<_> = document.select(&format!("{}", x))?.collect(); let x: Vec<_> = document.select(x)?.collect();
for tag in x { for tag in x {
if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(&x).is_ok_and(|mut x| x.next().is_some())) { if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(x).is_ok_and(|mut x| x.next().is_some())) {
tag.as_node().detach(); tag.as_node().detach();
} }
} }
@@ -261,9 +268,9 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
} }
for img in document.select("img[srcset]")? { for img in document.select("img[srcset]")? {
let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.to_string().split(",").filter_map(|x| { let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.split(",").filter_map(|x| {
let tok: Vec<&str> = x.trim().split(" ").collect(); let tok: Vec<&str> = x.trim().split(" ").collect();
let tok_1 = if tok.len() > 1 && tok[1].len() > 0 { let tok_1 = if tok.len() > 1 && !tok[1].is_empty() {
tok[1] tok[1]
} else { } else {
"1x" "1x"
@@ -315,9 +322,13 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
Ok(document.to_string()) Ok(document.to_string())
} }
/// Transforms rawHtml to html (formerly removeUnwantedElements)
///
/// # Safety
/// `opts` must be a valid pointer to a NUL-terminated C string containing the JSON-encoded options. Output will be an HTML string. Output string must be freed with free_string.
#[no_mangle] #[no_mangle]
pub extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 { pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(&x).map_err(|_| ())) { let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
Ok(x) => x, Ok(x) => x,
Err(_) => { Err(_) => {
return CString::new("RUSTFC:ERROR").unwrap().into_raw(); return CString::new("RUSTFC:ERROR").unwrap().into_raw();
@@ -332,7 +343,11 @@ pub extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
CString::new(out).unwrap().into_raw() CString::new(out).unwrap().into_raw()
} }
/// Frees a string allocated in Rust-land.
///
/// # Safety
/// `ptr` must be a string pointer that was returned by one of this library's functions and has not already been freed; it must not be used after this call.
#[no_mangle] #[no_mangle]
pub extern "C" fn free_string(ptr: *mut i8) { pub unsafe extern "C" fn free_string(ptr: *mut i8) {
drop(unsafe { CString::from_raw(ptr) }) drop(unsafe { CString::from_raw(ptr) })
} }