diff --git a/src/services/chatbot.py b/src/services/chatbot.py index 4a2903a..212a5f5 100644 --- a/src/services/chatbot.py +++ b/src/services/chatbot.py @@ -1,5 +1,6 @@ import os import json +import re from openai import OpenAI from pydantic import BaseModel, Field from typing import List, Dict, Optional @@ -79,10 +80,14 @@ class Chatbot: self.api_key = os.getenv("OPENAI_API_KEY") self.client = OpenAI(api_key=self.api_key) self.model = "gpt-4o-mini" + + def clean_text(self, text): + # Remove all surrogate characters + return re.sub(r'[\uD800-\uDFFF]', '', text) def _extract_text_from_docs(self, docs): """Extract text content from document objects.""" - return [doc.page_content for doc in docs] + return [self.clean_text(doc.page_content) for doc in docs] # Existing methods... def validate_worker(self, question, docs) -> VisionMissionResponse: