From 31d85a83e76b4399a16798e8dbfa1a6e5cb23a78 Mon Sep 17 00:00:00 2001 From: teslim Date: Fri, 13 Jun 2025 19:32:33 +0100 Subject: [PATCH] fix --- src/api/routes/chatbot.py | 4 ---- src/services/chatbot.py | 6 +----- src/utils/document_loader.py | 13 +++++++++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/api/routes/chatbot.py b/src/api/routes/chatbot.py index e0b7460..256d431 100644 --- a/src/api/routes/chatbot.py +++ b/src/api/routes/chatbot.py @@ -38,13 +38,9 @@ def validate_worker_document(): upload_folder = current_app.config['UPLOAD_FOLDER'] file_path = os.path.join(upload_folder, filename) file.save(file_path) - - print("loading document") # Load the document for processing docs = load_document(file_path) - - print("document loaded") # Instantiate the chatbot service chatbot = Chatbot() diff --git a/src/services/chatbot.py b/src/services/chatbot.py index c55e97c..03e5994 100644 --- a/src/services/chatbot.py +++ b/src/services/chatbot.py @@ -77,7 +77,7 @@ dummy_data = generate_dummy_data(num_assessments=100, max_users_per_assessment=5 #SopGeneratorDocument class Chatbot: def __init__(self): - self.api_key = os.getenv("OPENAI_API_KEY") + self.api_key = os.getenv("OPENAI_API_KEY") self.client = OpenAI(api_key=self.api_key) self.model = "gpt-4o-mini" @@ -87,11 +87,7 @@ class Chatbot: def _extract_text_from_docs(self, docs): """Extract text content from document objects.""" - - print(docs) - - return [self.clean_text(doc.page_content) for doc in docs] # Existing methods... 
diff --git a/src/utils/document_loader.py b/src/utils/document_loader.py index a9743f9..438cf8b 100644 --- a/src/utils/document_loader.py +++ b/src/utils/document_loader.py @@ -4,6 +4,16 @@ from reportlab.lib.pagesizes import letter from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer from reportlab.lib.styles import getSampleStyleSheet from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader +import pdfplumber +from langchain_core.documents import Document + +def load_pdf_with_pdfplumber(file_path): + docs = [] + with pdfplumber.open(file_path) as pdf: + for i, page in enumerate(pdf.pages): + text = page.extract_text() or "" + docs.append(Document(page_content=text, metadata={"page": i})) + return docs def convert_word_to_pdf(doc_path: str) -> str: """ @@ -61,8 +71,7 @@ def load_document(file_path: str): loader = PyPDFLoader(pdf_path) return loader.load() elif extension == '.pdf': - loader = PyPDFLoader(file_path) - return loader.load() + return load_pdf_with_pdfplumber(file_path) else: raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")