Files
ds-fire-fighter/utils/document_loader.py
T
2025-04-09 00:53:16 +01:00

71 lines
2.2 KiB
Python

import os
from docx import Document as DocxDocument
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a Word document to a plain-text PDF using python-docx and reportlab.

    Only the text of the paragraphs exposed by ``doc.paragraphs`` is carried
    over; every paragraph is rendered with reportlab's 'Normal' style, so the
    original formatting is not reproduced. The PDF is written next to the
    source file (same base name, ``.pdf`` extension) and replaces any file
    already at that path.

    Args:
        doc_path (str): The path to the Word file to convert.

    Returns:
        str: The path to the converted PDF file.
    """
    # Local import keeps this function self-contained; stdlib only.
    from xml.sax.saxutils import escape

    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    # Load the Word document
    doc = DocxDocument(doc_path)

    # Create a PDF
    pdf = SimpleDocTemplate(pdf_path, pagesize=letter)
    normal_style = getSampleStyleSheet()['Normal']
    flowables = []

    # Extract text from paragraphs and add to PDF
    for para in doc.paragraphs:
        text = para.text
        if not text:
            # Empty paragraphs add neither text nor spacing.
            continue
        # reportlab parses Paragraph text as XML-like markup, so literal
        # '<', '>' and '&' from the document must be escaped; otherwise they
        # are treated as tags/entities and can corrupt or abort the build.
        flowables.append(Paragraph(escape(text), normal_style))
        flowables.append(Spacer(1, 12))

    # Build the PDF
    pdf.build(flowables)
    return pdf_path
def load_document(file_path: str):
    """
    Load a PDF, DOCX, or DOC file into Document objects.

    The loader is chosen from the lower-cased file extension:
    ``.pdf`` is read with PyPDFLoader, ``.docx`` with
    UnstructuredWordDocumentLoader, and ``.doc`` is first converted to a PDF
    via ``convert_word_to_pdf`` and then read with PyPDFLoader.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        List[Document]: The loaded documents on success.
        None: If the extension is unsupported or anything raises while
        loading; the error message is printed to stdout instead of being
        propagated.
    """
    try:
        suffix = os.path.splitext(file_path)[1].lower()

        if suffix == '.pdf':
            return PyPDFLoader(file_path).load()

        if suffix == '.docx':
            # .docx files go straight through the Unstructured loader
            return UnstructuredWordDocumentLoader(file_path).load()

        if suffix == '.doc':
            # NOTE(review): the conversion helper opens the file with
            # python-docx; confirm it can actually read legacy .doc input.
            converted_pdf = convert_word_to_pdf(file_path)
            return PyPDFLoader(converted_pdf).load()

        raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")
    except Exception as exc:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error loading document: {exc}")
        return None