Fix: Concatenate metadata arrays into strings with exceptions (#1574)
* Fix: Concatenate metadata arrays into strings except for ogLocaleAlternate
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>
* Fix: Only concatenate description field, preserve other metadata arrays
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>
* Fix: Only concatenate description field, keep other metadata fields in original format
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nicolas Camara <nicolascamara29@gmail.com>
This commit is contained in:
committed by
GitHub
parent
f838190ba6
commit
a5a915d639
@@ -128,17 +128,30 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
|
|||||||
if let Some(content) = attrs.get("content") {
|
if let Some(content) = attrs.get("content") {
|
||||||
if let Some(v) = out.get(name) {
|
if let Some(v) = out.get(name) {
|
||||||
match v {
|
match v {
|
||||||
Value::String(_) => {
|
Value::String(existing) => {
|
||||||
if name != "title" { // preserve title tag in metadata
|
if name == "description" {
|
||||||
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
|
out.insert(name.to_string(), Value::String(format!("{}, {}", existing, content)));
|
||||||
|
} else if name != "title" { // preserve title tag in metadata
|
||||||
|
out.insert(name.to_string(), Value::Array(vec! [Value::String(existing.clone()), Value::String(content.to_string())]));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Value::Array(_) => {
|
Value::Array(existing_array) => {
|
||||||
match out.get_mut(name) {
|
if name == "description" {
|
||||||
Some(Value::Array(x)) => {
|
let mut values: Vec<String> = existing_array.iter()
|
||||||
x.push(Value::String(content.to_string()));
|
.filter_map(|v| match v {
|
||||||
},
|
Value::String(s) => Some(s.clone()),
|
||||||
_ => unreachable!(),
|
_ => None,
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
values.push(content.to_string());
|
||||||
|
out.insert(name.to_string(), Value::String(values.join(", ")));
|
||||||
|
} else {
|
||||||
|
match out.get_mut(name) {
|
||||||
|
Some(Value::Array(x)) => {
|
||||||
|
x.push(Value::String(content.to_string()));
|
||||||
|
},
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
|
|||||||
@@ -0,0 +1,44 @@
|
|||||||
|
import { extractMetadata } from "../../scraper/scrapeURL/lib/extractMetadata";
|
||||||
|
import { jest, describe, it, expect } from "@jest/globals";
|
||||||
|
|
||||||
|
describe("Metadata concatenation", () => {
|
||||||
|
it("should concatenate description field into a string while preserving arrays for other metadata fields", async () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta name="description" content="First description">
|
||||||
|
<meta name="description" content="Second description">
|
||||||
|
<meta property="og:locale:alternate" content="en_US">
|
||||||
|
<meta property="og:locale:alternate" content="fr_FR">
|
||||||
|
<meta name="keywords" content="first keyword">
|
||||||
|
<meta name="keywords" content="second keyword">
|
||||||
|
</head>
|
||||||
|
<body></body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const meta: any = {
|
||||||
|
url: "https://example.com",
|
||||||
|
id: "test-id",
|
||||||
|
logger: {
|
||||||
|
warn: jest.fn(),
|
||||||
|
error: jest.fn()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const metadata = await extractMetadata(meta, html);
|
||||||
|
|
||||||
|
expect(metadata.description).toBeDefined();
|
||||||
|
expect(Array.isArray(metadata.description)).toBe(false);
|
||||||
|
expect(typeof metadata.description).toBe("string");
|
||||||
|
expect(metadata.description).toBe("First description, Second description");
|
||||||
|
|
||||||
|
expect(metadata.ogLocaleAlternate).toBeDefined();
|
||||||
|
expect(Array.isArray(metadata.ogLocaleAlternate)).toBe(true);
|
||||||
|
expect(metadata.ogLocaleAlternate).toEqual(["en_US", "fr_FR"]);
|
||||||
|
|
||||||
|
expect(metadata.keywords).toBeDefined();
|
||||||
|
expect(Array.isArray(metadata.keywords)).toBe(true);
|
||||||
|
expect(metadata.keywords).toEqual(["first keyword", "second keyword"]);
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -137,12 +137,22 @@ export async function extractMetadata(
|
|||||||
const content = soup(elem).attr("content");
|
const content = soup(elem).attr("content");
|
||||||
|
|
||||||
if (name && content) {
|
if (name && content) {
|
||||||
if (customMetadata[name] === undefined) {
|
if (name === "description") {
|
||||||
customMetadata[name] = content;
|
if (customMetadata[name] === undefined) {
|
||||||
} else if (Array.isArray(customMetadata[name])) {
|
customMetadata[name] = content;
|
||||||
(customMetadata[name] as string[]).push(content);
|
} else {
|
||||||
|
customMetadata[name] = Array.isArray(customMetadata[name])
|
||||||
|
? [...customMetadata[name] as string[], content].join(", ")
|
||||||
|
: `${customMetadata[name]}, ${content}`;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
customMetadata[name] = [customMetadata[name] as string, content];
|
if (customMetadata[name] === undefined) {
|
||||||
|
customMetadata[name] = content;
|
||||||
|
} else if (Array.isArray(customMetadata[name])) {
|
||||||
|
(customMetadata[name] as string[]).push(content);
|
||||||
|
} else {
|
||||||
|
customMetadata[name] = [customMetadata[name] as string, content];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
Reference in New Issue
Block a user