diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs
index 8d944f22..eb3be8b9 100644
--- a/apps/api/sharedLibs/html-transformer/src/lib.rs
+++ b/apps/api/sharedLibs/html-transformer/src/lib.rs
@@ -128,17 +128,30 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
             if let Some(content) = attrs.get("content") {
                 if let Some(v) = out.get(name) {
                     match v {
-                        Value::String(_) => {
-                            if name != "title" { // preserve title tag in metadata
-                                out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
+                        Value::String(existing) => {
+                            if name == "description" {
+                                out.insert(name.to_string(), Value::String(format!("{}, {}", existing, content)));
+                            } else if name != "title" { // preserve title tag in metadata
+                                out.insert(name.to_string(), Value::Array(vec! [Value::String(existing.clone()), Value::String(content.to_string())]));
                             }
                         },
-                        Value::Array(_) => {
-                            match out.get_mut(name) {
-                                Some(Value::Array(x)) => {
-                                    x.push(Value::String(content.to_string()));
-                                },
-                                _ => unreachable!(),
+                        Value::Array(existing_array) => {
+                            if name == "description" {
+                                let mut values: Vec<String> = existing_array.iter()
+                                    .filter_map(|v| match v {
+                                        Value::String(s) => Some(s.clone()),
+                                        _ => None,
+                                    })
+                                    .collect();
+                                values.push(content.to_string());
+                                out.insert(name.to_string(), Value::String(values.join(", ")));
+                            } else {
+                                match out.get_mut(name) {
+                                    Some(Value::Array(x)) => {
+                                        x.push(Value::String(content.to_string()));
+                                    },
+                                    _ => unreachable!(),
+                                }
                             }
                         },
                         _ => unreachable!(),
diff --git a/apps/api/src/__tests__/snips/metadata-concat.test.ts b/apps/api/src/__tests__/snips/metadata-concat.test.ts
new file mode 100644
index 00000000..b02319a8
--- /dev/null
+++ b/apps/api/src/__tests__/snips/metadata-concat.test.ts
@@ -0,0 +1,44 @@
+import { extractMetadata } from "../../scraper/scrapeURL/lib/extractMetadata";
+import { jest, describe, it, expect } from "@jest/globals";
+
+describe("Metadata concatenation", () => {
+  it("should concatenate description field into a string while preserving arrays for other metadata fields", async () => {
+    const html = `
+      <html>
+        <head>
+          <meta name="description" content="First description">
+          <meta name="description" content="Second description">
+          <meta property="og:locale:alternate" content="en_US">
+          <meta property="og:locale:alternate" content="fr_FR">
+          <meta name="keywords" content="first keyword">
+          <meta name="keywords" content="second keyword">
+        </head>
+        <body></body>
+      </html>
+    `;
+
+    const meta: any = {
+      url: "https://example.com",
+      id: "test-id",
+      logger: {
+        warn: jest.fn(),
+        error: jest.fn()
+      }
+    };
+
+    const metadata = await extractMetadata(meta, html);
+
+    expect(metadata.description).toBeDefined();
+    expect(Array.isArray(metadata.description)).toBe(false);
+    expect(typeof metadata.description).toBe("string");
+    expect(metadata.description).toBe("First description, Second description");
+
+    expect(metadata.ogLocaleAlternate).toBeDefined();
+    expect(Array.isArray(metadata.ogLocaleAlternate)).toBe(true);
+    expect(metadata.ogLocaleAlternate).toEqual(["en_US", "fr_FR"]);
+
+    expect(metadata.keywords).toBeDefined();
+    expect(Array.isArray(metadata.keywords)).toBe(true);
+    expect(metadata.keywords).toEqual(["first keyword", "second keyword"]);
+  });
+});
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
index e0d68534..61d5ab04 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -137,12 +137,22 @@ export async function extractMetadata(
         const content = soup(elem).attr("content");
 
         if (name && content) {
-          if (customMetadata[name] === undefined) {
-            customMetadata[name] = content;
-          } else if (Array.isArray(customMetadata[name])) {
-            (customMetadata[name] as string[]).push(content);
+          if (name === "description") {
+            if (customMetadata[name] === undefined) {
+              customMetadata[name] = content;
+            } else {
+              customMetadata[name] = Array.isArray(customMetadata[name])
+                ? [...customMetadata[name] as string[], content].join(", ")
+                : `${customMetadata[name]}, ${content}`;
+            }
           } else {
-            customMetadata[name] = [customMetadata[name] as string, content];
+            if (customMetadata[name] === undefined) {
+              customMetadata[name] = content;
+            } else if (Array.isArray(customMetadata[name])) {
+              (customMetadata[name] as string[]).push(content);
+            } else {
+              customMetadata[name] = [customMetadata[name] as string, content];
+            }
           }
         }
       } catch (error) {