Role extraction and SOP generation added

This commit is contained in:
2024-08-31 01:29:39 +00:00
parent ccb0db21d6
commit 1f02a30a16
15 changed files with 734 additions and 11 deletions
+48
View File
@@ -0,0 +1,48 @@
import os
from spire.doc import Document, FileFormat
from langchain_community.document_loaders import PyPDFLoader
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The output path is the input path with its extension replaced by
    ``.pdf``, so the PDF is written alongside the source document.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Any exception raised by Spire.Doc while loading or saving is
        propagated unchanged; the document handle is closed first.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and write it back out as PDF.
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Always release the document, even when loading or saving fails,
        # so a bad input file does not leak the underlying handle.
        document.Close()

    return pdf_path
def load_document(file_path: str):
    """
    Load a PDF, DOCX, or DOC file through ``PyPDFLoader``.

    Word documents are converted to PDF with ``convert_word_to_pdf`` first,
    and the resulting PDF is what gets loaded. PDF files are loaded directly.
    The extension check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The result of ``PyPDFLoader(...).load()`` for the (possibly converted)
        PDF -- presumably a list of LangChain Document objects; confirm
        against the loader's documentation.

    Raises:
        ValueError: If the file extension is not .pdf, .docx, or .doc.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Reject anything we cannot handle before doing any work.
    if suffix not in ('.doc', '.docx', '.pdf'):
        raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")

    # PDFs are used as-is; Word files go through the PDF conversion step.
    source_pdf = file_path if suffix == '.pdf' else convert_word_to_pdf(file_path)
    return PyPDFLoader(source_pdf).load()
+20
View File
@@ -0,0 +1,20 @@
import os
def delete_file(file_path):
    """
    Remove a single file, reporting the outcome on standard output.

    This is a best-effort operation: an ``OSError`` from the removal (for
    example a missing file) is caught and printed rather than raised.

    Args:
        file_path: The path of the file to delete.

    Returns:
        None
    """
    try:
        # os.unlink is the same operation as os.remove.
        os.unlink(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as err:
        print(f"Error deleting file {file_path}: {err}")
import os
def delete_all_files_in_directory(directory_path):
    """
    Delete every regular file directly inside a directory.

    Subdirectories (and anything else ``os.path.isfile`` rejects) are left
    untouched; the function does not recurse. Outcomes are reported on
    standard output and errors are never raised to the caller.

    A failure to delete one file is reported and does not stop the remaining
    files from being processed.

    Args:
        directory_path: The path of the directory whose files are deleted.

    Returns:
        None
    """
    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        # The directory itself could not be listed; nothing to delete.
        print(f"Error deleting files in {directory_path}: {e}")
        return

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue
        # Handle each removal on its own so one failing file does not
        # abort the rest of the sweep.
        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Error deleting file {file_path}: {e}")
        else:
            print(f"Deleted file: {file_path}")