53 lines
1.5 KiB
Python
53 lines
1.5 KiB
Python
|
|
import os
|
||
|
|
from spire.doc import Document, FileFormat
|
||
|
|
from langchain_community.document_loaders import PyPDFLoader
|
||
|
|
|
||
|
|
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The output path is the input path with its extension replaced by
    '.pdf', so the PDF is written alongside the source file.
    NOTE(review): an existing file at that path is presumably
    overwritten by SaveToFile -- confirm against the Spire.Doc docs.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Exception: Whatever Spire.Doc raises while loading or saving is
            propagated unchanged, after the document has been closed.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and write it back out as PDF.
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Always close the document, even when loading or saving fails,
        # so a bad input file does not leave it open.
        document.Close()

    return pdf_path
|
||
|
|
|
||
|
|
def load_document(file_path: str):
    """
    Load a PDF, DOCX, or DOC file through PyPDFLoader.

    Word files (.doc / .docx) are first converted to PDF with
    convert_word_to_pdf; PDF files are read directly. The extension
    check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The value returned by PyPDFLoader.load() for the (possibly
        converted) PDF, or None if anything goes wrong. Failures,
        including an unsupported extension, are reported with a printed
        message instead of being raised to the caller.
    """
    try:
        suffix = os.path.splitext(file_path)[1].lower()

        # Reject anything that is neither a PDF nor a Word document.
        if suffix not in ('.pdf', '.doc', '.docx'):
            raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")

        # PDFs are read in place; Word files go through conversion first.
        source_pdf = file_path if suffix == '.pdf' else convert_word_to_pdf(file_path)
        return PyPDFLoader(source_pdf).load()
    except Exception as err:
        print(f"Error loading document: {str(err)}")
        return None
|