# ds_fire_fighter/utils/document_loader.py

import os
from spire.doc import Document, FileFormat
from langchain_community.document_loaders import PyPDFLoader

def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The output path is the input path with its extension replaced by
    ``.pdf``, so the PDF is written alongside the source file.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Exception: Any error raised by Spire.Doc while loading or saving is
            propagated to the caller after the document has been closed.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and write it back out as PDF.
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Close the document even when loading or saving fails, so a bad
        # input file does not leave the Spire.Doc document open.
        document.Close()

    return pdf_path

def load_document(file_path: str):
    """
    Load a PDF, DOCX, or DOC file through PyPDFLoader.

    Word files (.doc / .docx) are first converted to PDF with
    ``convert_word_to_pdf``; PDF files are loaded directly. The extension
    check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The result of ``PyPDFLoader.load()`` (presumably a list of Document
        objects, one per page -- confirm against the LangChain version in
        use), or ``None`` if the file type is unsupported or any error
        occurs. Errors are printed to stdout rather than raised.
    """
    word_extensions = ('.doc', '.docx')

    try:
        _, extension = os.path.splitext(file_path)
        extension = extension.lower()

        # Reject anything that is neither a Word document nor a PDF; the
        # error is reported by the handler below, not propagated.
        if extension not in word_extensions and extension != '.pdf':
            raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")

        # Word documents are rendered to PDF first; PDFs are used as-is.
        if extension in word_extensions:
            source_pdf = convert_word_to_pdf(file_path)
        else:
            source_pdf = file_path

        return PyPDFLoader(source_pdf).load()
    except Exception as e:
        print(f"Error loading document: {str(e)}")
        return None
ds apis implemented 2025-02-06 20:12:43 +00:00			`import os`
			`from spire.doc import Document, FileFormat`
			`from langchain_community.document_loaders import PyPDFLoader`

			`def convert_word_to_pdf(doc_path: str) -> str:`
			`"""`
			`Convert a .doc or .docx file to PDF using Spire.Doc.`

			`Args:`
			`doc_path (str): The path to the .doc or .docx file.`

			`Returns:`
			`str: The path to the converted PDF file.`
			`"""`
			`pdf_path = os.path.splitext(doc_path)[0] + '.pdf'`

			`# Create a Document object`
			`document = Document()`
			`# Load the Word document`
			`document.LoadFromFile(doc_path)`
			`# Save as PDF`
			`document.SaveToFile(pdf_path, FileFormat.PDF)`
			`document.Close()`

			`return pdf_path`

			`def load_document(file_path: str):`
			`"""`
			`Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.`

			`Args:`
			`file_path (str): The path to the file to load.`

			`Returns:`
			`List[Document]: A list of Document objects representing the contents of the file.`
			`"""`

			`try:`
			`extension = os.path.splitext(file_path)[1].lower()`

			`if extension in ['.doc', '.docx']:`
			`# Convert .doc or .docx to PDF first`
			`pdf_path = convert_word_to_pdf(file_path)`
			`loader = PyPDFLoader(pdf_path)`
			`elif extension == '.pdf':`
			`loader = PyPDFLoader(file_path)`
			`else:`
			`raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")`

			`return loader.load()`
			`except Exception as e:`
			`print(f"Error loading document: {str(e)}")`
			`return None`