import os
from typing import Optional

from spire.doc import Document, FileFormat
from langchain_community.document_loaders import PyPDFLoader


def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The PDF is written next to the source file, using the same base name
    with a '.pdf' extension. This function does not remove it afterwards.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Exception: Whatever Spire.Doc raises while loading or saving is
            propagated unchanged; the document is closed first.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and save it as PDF.
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Always release the document, even when loading or saving fails.
        document.Close()

    return pdf_path


def load_document(file_path: str) -> Optional[list]:
    """
    Utility function to load a PDF, DOCX, or DOC file.

    Word files (.doc / .docx) are first converted to PDF with
    convert_word_to_pdf, then the resulting PDF is read with PyPDFLoader.
    The extension check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        Optional[list]: The list returned by PyPDFLoader.load() (loader
            document objects, not spire.doc.Document), or None if the file
            type is unsupported or any error occurs while converting or
            loading. Errors are reported on stdout rather than raised.
    """
    try:
        extension = os.path.splitext(file_path)[1].lower()

        if extension in ('.doc', '.docx'):
            # Convert .doc or .docx to PDF first.
            pdf_path = convert_word_to_pdf(file_path)
            loader = PyPDFLoader(pdf_path)
        elif extension == '.pdf':
            loader = PyPDFLoader(file_path)
        else:
            # Raised inside the try on purpose: unsupported types are
            # reported through the same print-and-return-None path below.
            raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")

        return loader.load()
    except Exception as e:
        # Best-effort boundary: callers receive None instead of an exception.
        print(f"Error loading document: {e}")
        return None