fixed missing up updates
This commit is contained in:
+32
-14
@@ -1,32 +1,45 @@
|
||||
import os
|
||||
from spire.doc import Document, FileFormat
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from docx import Document as DocxDocument
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
|
||||
|
||||
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .docx file to PDF using python-docx and reportlab.

    Only the text of the paragraphs exposed by ``doc.paragraphs`` is
    transferred; each non-empty paragraph is rendered in reportlab's
    'Normal' sample style followed by a small vertical gap. The PDF is
    written next to the input file, with the extension replaced by '.pdf'.

    Args:
        doc_path (str): The path to the .docx file.

    Returns:
        str: The path to the converted PDF file.
    """
    # Local import keeps this block self-contained; stdlib only.
    from xml.sax.saxutils import escape

    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'
    doc = DocxDocument(doc_path)

    # Create a PDF
    pdf = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()
    flowables = []

    # Extract text from paragraphs and add to PDF
    for para in doc.paragraphs:
        if para.text:
            # reportlab's Paragraph interprets its text as XML-like inline
            # markup, so literal '&', '<' and '>' from the document must be
            # escaped or the build can fail / text can be misrendered.
            p = Paragraph(escape(para.text), styles['Normal'])
            flowables.append(p)
            flowables.append(Spacer(1, 12))

    # Build the PDF
    pdf.build(flowables)

    return pdf_path
|
||||
|
||||
def load_document(file_path: str):
|
||||
"""
|
||||
Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.
|
||||
Utility function to load a PDF, DOCX, or DOC file.
|
||||
|
||||
Args:
|
||||
file_path (str): The path to the file to load.
|
||||
@@ -38,16 +51,21 @@ def load_document(file_path: str):
|
||||
try:
|
||||
extension = os.path.splitext(file_path)[1].lower()
|
||||
|
||||
if extension in ['.doc', '.docx']:
|
||||
# Convert .doc or .docx to PDF first
|
||||
if extension == '.docx':
|
||||
# For .docx files, use UnstructuredWordDocumentLoader directly
|
||||
loader = UnstructuredWordDocumentLoader(file_path)
|
||||
return loader.load()
|
||||
elif extension == '.doc':
|
||||
# Convert .doc to .pdf first
|
||||
pdf_path = convert_word_to_pdf(file_path)
|
||||
loader = PyPDFLoader(pdf_path)
|
||||
return loader.load()
|
||||
elif extension == '.pdf':
|
||||
loader = PyPDFLoader(file_path)
|
||||
return loader.load()
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")
|
||||
|
||||
return loader.load()
|
||||
except Exception as e:
|
||||
print(f"Error loading document: {str(e)}")
|
||||
return None
|
||||
Reference in New Issue
Block a user