fixed missing updates

2025-04-09 00:53:16 +01:00
parent d1ed8b9e3f
commit d34e304017
5 changed files with 248 additions and 137 deletions
@@ -1,32 +1,45 @@
 import os
-from spire.doc import Document, FileFormat
-from langchain_community.document_loaders import PyPDFLoader
+from docx import Document as DocxDocument
+from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+from reportlab.lib.styles import getSampleStyleSheet
+from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader

 def convert_word_to_pdf(doc_path: str) -> str:
    """
-    Convert a .doc or .docx file to PDF using Spire.Doc.
+    Convert a .docx file to PDF using python-docx and reportlab, keeping only the text of non-empty paragraphs.
    
    Args:
-        doc_path (str): The path to the .doc or .docx file.
+        doc_path (str): The path to the .docx file.

    Returns:
        str: The path to the converted PDF file.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'
    
-    # Create a Document object
-    document = Document()
    # Load the Word document
-    document.LoadFromFile(doc_path)
-    # Save as PDF
-    document.SaveToFile(pdf_path, FileFormat.PDF)
-    document.Close()
+    doc = DocxDocument(doc_path)
+    
+    # Create a PDF
+    pdf = SimpleDocTemplate(pdf_path, pagesize=letter)
+    styles = getSampleStyleSheet()
+    flowables = []
+    
+    # Extract text from paragraphs and add to PDF
+    for para in doc.paragraphs:
+        if para.text:
+            p = Paragraph(para.text, styles['Normal'])
+            flowables.append(p)
+            flowables.append(Spacer(1, 12))
+    
+    # Build the PDF
+    pdf.build(flowables)
    
    return pdf_path

 def load_document(file_path: str):
    """
-    Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.
+    Utility function to load a PDF, DOCX, or DOC file.

    Args:
        file_path (str): The path to the file to load.
@@ -38,16 +51,21 @@ def load_document(file_path: str):
    try:
        extension = os.path.splitext(file_path)[1].lower()
        
-        if extension in ['.doc', '.docx']:
-            # Convert .doc or .docx to PDF first
+        if extension == '.docx':
+            # For .docx files, use UnstructuredWordDocumentLoader directly
+            loader = UnstructuredWordDocumentLoader(file_path)
+            return loader.load()
+        elif extension == '.doc':
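+            # Convert .doc to .pdf first (NOTE(review): convert_word_to_pdf opens the file with python-docx and is documented as .docx-only; confirm legacy .doc input actually loads)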
            pdf_path = convert_word_to_pdf(file_path)
            loader = PyPDFLoader(pdf_path)
+            return loader.load()
        elif extension == '.pdf':
            loader = PyPDFLoader(file_path)
+            return loader.load()
        else:
            raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")
        
-        return loader.load()
    except Exception as e:
        print(f"Error loading document: {str(e)}")
        return None