import os
from typing import Optional
from xml.sax.saxutils import escape

from docx import Document as DocxDocument
from docx.opc.exceptions import PackageNotFoundError
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader


def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a Word document to PDF using python-docx and reportlab.

    Only the text of the document's paragraphs is carried over, one PDF
    paragraph per non-empty Word paragraph in the 'Normal' sample style.
    The PDF is written next to the input file, using the same base name
    with a '.pdf' extension.

    Args:
        doc_path (str): The path to the Word file. It must be readable by
            python-docx.

    Returns:
        str: The path to the converted PDF file.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    # Load the Word document
    doc = DocxDocument(doc_path)

    # Create a PDF
    pdf = SimpleDocTemplate(pdf_path, pagesize=letter)
    normal_style = getSampleStyleSheet()['Normal']
    flowables = []

    # Extract text from paragraphs and add to PDF
    for para in doc.paragraphs:
        if para.text:
            # reportlab parses Paragraph text as XML-like inline markup, so
            # literal '&', '<' and '>' from the document must be escaped or
            # they are misread as tags/entities (or abort the build).
            flowables.append(Paragraph(escape(para.text), normal_style))
            flowables.append(Spacer(1, 12))

    # Build the PDF
    pdf.build(flowables)

    return pdf_path


def load_document(file_path: str) -> Optional[list]:
    """
    Utility function to load a PDF, DOCX, or DOC file.

    The loader is chosen from the (case-insensitive) file extension:
    '.docx' uses UnstructuredWordDocumentLoader, '.pdf' uses PyPDFLoader,
    and '.doc' is first converted to PDF with convert_word_to_pdf and then
    read with PyPDFLoader. If python-docx cannot open an existing '.doc'
    file as a Word package (as happens with the legacy binary format), the
    file is loaded with UnstructuredWordDocumentLoader instead.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        List[Document]: A list of Document objects representing the contents
        of the file, or None if the file type is unsupported or loading
        fails for any reason (the error is printed, not raised).
    """
    try:
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.docx':
            # For .docx files, use UnstructuredWordDocumentLoader directly
            return UnstructuredWordDocumentLoader(file_path).load()

        if extension == '.doc':
            try:
                # Convert .doc to .pdf first
                pdf_path = convert_word_to_pdf(file_path)
            except PackageNotFoundError:
                # python-docx raises this both for a missing file and for a
                # file that is not an OOXML package; only the latter is worth
                # retrying with the other loader.
                if not os.path.isfile(file_path):
                    raise
                return UnstructuredWordDocumentLoader(file_path).load()
            return PyPDFLoader(pdf_path).load()

        if extension == '.pdf':
            return PyPDFLoader(file_path).load()

        raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")
    except Exception as e:
        # Deliberate best-effort contract: report the problem and signal
        # failure with None rather than propagating the exception.
        print(f"Error loading document: {str(e)}")
        return None