From 2e2c3d52ced4003611c124e7cf6df1e069c8fff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 3 Apr 2025 09:57:16 +0200 Subject: [PATCH] feat: add swoogo classes to force include main tags --- .../api/sharedLibs/html-transformer/src/lib.rs | 18 ++++++++++++++++-- .../scrapeURL/lib/removeUnwantedElements.ts | 16 +++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs index 8643208d..8d944f22 100644 --- a/apps/api/sharedLibs/html-transformer/src/lib.rs +++ b/apps/api/sharedLibs/html-transformer/src/lib.rs @@ -197,8 +197,22 @@ const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [ "#cookie", ]; -const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = [ - "#main" +const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [ + "#main", + + // swoogo event software has .widget in all of their content + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda", ]; #[derive(Deserialize)] diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index 693b02bb..62a0a726 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -49,7 +49,21 @@ const excludeNonMainTags = [ "#cookie", ]; -const forceIncludeMainTags = ["#main"]; +const forceIncludeMainTags = [ + "#main", + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda" +]; export const htmlTransform = async ( html: string,