fix

2025-06-13 19:32:33 +01:00
parent bfcdcf4786
commit 31d85a83e7
3 changed files with 12 additions and 11 deletions
@@ -39,13 +39,9 @@ def validate_worker_document():
            file_path = os.path.join(upload_folder, filename)
            file.save(file_path)

-            print("loading document")
-
            # Load the document for processing
            docs = load_document(file_path)

-            print("document loaded")
-
            # Instantiate the chatbot service
            chatbot = Chatbot()

@@ -77,7 +77,7 @@ dummy_data = generate_dummy_data(num_assessments=100, max_users_per_assessment=5
 #SopGeneratorDocument
 class Chatbot:
    def __init__(self):
-        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.api_key = os.getenv("OPENAI_API_KEY") or "<REDACTED-OPENAI-API-KEY>"
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o-mini"
        
@@ -87,11 +87,7 @@ class Chatbot:

    def _extract_text_from_docs(self, docs):
        """Extract text content from document objects."""
-        
-        
        print(docs)
-        
-        
        return [self.clean_text(doc.page_content) for doc in docs]
    # Existing methods...

@@ -4,6 +4,16 @@ from reportlab.lib.pagesizes import letter
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
 from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
+import pdfplumber
+from langchain_core.documents import Document
+
+def load_pdf_with_pdfplumber(file_path):
+    """Load a PDF with pdfplumber and return one Document per page.
+
+    Args:
+        file_path: Path of the PDF file to open.
+
+    Returns:
+        A list of Document objects in page order. Each one holds the text
+        extracted from the page and metadata with the source path and the
+        zero-based page index.
+    """
+    docs = []
+    with pdfplumber.open(file_path) as pdf:
+        for i, page in enumerate(pdf.pages):
+            # extract_text() can yield None for a page without a text layer
+            # (e.g. a scanned image); page_content must always be a string.
+            text = page.extract_text() or ""
+            docs.append(
+                Document(
+                    page_content=text,
+                    # Keep "source" alongside "page" so the metadata matches
+                    # what the PyPDFLoader path used for converted files emits.
+                    metadata={"source": file_path, "page": i},
+                )
+            )
+    return docs

 def convert_word_to_pdf(doc_path: str) -> str:
    """
@@ -61,8 +71,7 @@ def load_document(file_path: str):
            loader = PyPDFLoader(pdf_path)
            return loader.load()
        elif extension == '.pdf':
-            loader = PyPDFLoader(file_path)
-            return loader.load()
+            return load_pdf_with_pdfplumber(file_path)
        else:
            raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")