diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs
index 8d944f22..eb3be8b9 100644
--- a/apps/api/sharedLibs/html-transformer/src/lib.rs
+++ b/apps/api/sharedLibs/html-transformer/src/lib.rs
@@ -128,17 +128,30 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
if let Some(content) = attrs.get("content") {
if let Some(v) = out.get(name) {
match v {
- Value::String(_) => {
- if name != "title" { // preserve title tag in metadata
- out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
+ Value::String(existing) => {
+ if name == "description" {
+ out.insert(name.to_string(), Value::String(format!("{}, {}", existing, content)));
+ } else if name != "title" { // preserve title tag in metadata
+ out.insert(name.to_string(), Value::Array(vec! [Value::String(existing.clone()), Value::String(content.to_string())]));
}
},
- Value::Array(_) => {
- match out.get_mut(name) {
- Some(Value::Array(x)) => {
- x.push(Value::String(content.to_string()));
- },
- _ => unreachable!(),
+ Value::Array(existing_array) => {
+ if name == "description" {
+ let mut values: Vec<String> = existing_array.iter()
+ .filter_map(|v| match v {
+ Value::String(s) => Some(s.clone()),
+ _ => None,
+ })
+ .collect();
+ values.push(content.to_string());
+ out.insert(name.to_string(), Value::String(values.join(", ")));
+ } else {
+ match out.get_mut(name) {
+ Some(Value::Array(x)) => {
+ x.push(Value::String(content.to_string()));
+ },
+ _ => unreachable!(),
+ }
}
},
_ => unreachable!(),
diff --git a/apps/api/src/__tests__/snips/metadata-concat.test.ts b/apps/api/src/__tests__/snips/metadata-concat.test.ts
new file mode 100644
index 00000000..b02319a8
--- /dev/null
+++ b/apps/api/src/__tests__/snips/metadata-concat.test.ts
@@ -0,0 +1,44 @@
+import { extractMetadata } from "../../scraper/scrapeURL/lib/extractMetadata";
+import { jest, describe, it, expect } from "@jest/globals";
+
+describe("Metadata concatenation", () => {
+ it("should concatenate description field into a string while preserving arrays for other metadata fields", async () => {
+ const html = `
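+ <html>
+ <head>
+ <meta name="description" content="First description">
+ <meta name="description" content="Second description">
+ <meta property="og:locale:alternate" content="en_US">
+ <meta property="og:locale:alternate" content="fr_FR">
+ <meta name="keywords" content="first keyword">
+ <meta name="keywords" content="second keyword">
+ </head>
+ <body></body>
+ </html>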
+ `;
+
+ const meta: any = {
+ url: "https://example.com",
+ id: "test-id",
+ logger: {
+ warn: jest.fn(),
+ error: jest.fn()
+ }
+ };
+
+ const metadata = await extractMetadata(meta, html);
+
+ expect(metadata.description).toBeDefined();
+ expect(Array.isArray(metadata.description)).toBe(false);
+ expect(typeof metadata.description).toBe("string");
+ expect(metadata.description).toBe("First description, Second description");
+
+ expect(metadata.ogLocaleAlternate).toBeDefined();
+ expect(Array.isArray(metadata.ogLocaleAlternate)).toBe(true);
+ expect(metadata.ogLocaleAlternate).toEqual(["en_US", "fr_FR"]);
+
+ expect(metadata.keywords).toBeDefined();
+ expect(Array.isArray(metadata.keywords)).toBe(true);
+ expect(metadata.keywords).toEqual(["first keyword", "second keyword"]);
+ });
+});
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
index e0d68534..61d5ab04 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -137,12 +137,22 @@ export async function extractMetadata(
const content = soup(elem).attr("content");
if (name && content) {
- if (customMetadata[name] === undefined) {
- customMetadata[name] = content;
- } else if (Array.isArray(customMetadata[name])) {
- (customMetadata[name] as string[]).push(content);
+ if (name === "description") {
+ if (customMetadata[name] === undefined) {
+ customMetadata[name] = content;
+ } else {
+ customMetadata[name] = Array.isArray(customMetadata[name])
+ ? [...customMetadata[name] as string[], content].join(", ")
+ : `${customMetadata[name]}, ${content}`;
+ }
} else {
- customMetadata[name] = [customMetadata[name] as string, content];
+ if (customMetadata[name] === undefined) {
+ customMetadata[name] = content;
+ } else if (Array.isArray(customMetadata[name])) {
+ (customMetadata[name] as string[]).push(content);
+ } else {
+ customMetadata[name] = [customMetadata[name] as string, content];
+ }
}
}
} catch (error) {