ds apis implemented
This commit is contained in:
@@ -0,0 +1,53 @@
|
||||
import os
|
||||
from spire.doc import Document, FileFormat
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The PDF path is derived from ``doc_path`` by swapping its extension for
    ``.pdf``, so the output lands next to the source file.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Exception: Whatever Spire.Doc raises when the document cannot be
            loaded or saved is propagated to the caller.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and write it back out as PDF
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Always release the document, even when loading or saving fails
        document.Close()

    return pdf_path
|
||||
|
||||
def load_document(file_path: str):
    """
    Load a PDF, DOCX, or DOC file through PyPDFLoader.

    Word files are converted to PDF with ``convert_word_to_pdf`` first and
    the resulting PDF is loaded; PDF files are loaded directly.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The result of ``PyPDFLoader(...).load()`` on success, or ``None``
        when anything goes wrong (unsupported extension, conversion error,
        load error). Failures are printed, not raised.
    """
    word_suffixes = ('.doc', '.docx')

    try:
        suffix = os.path.splitext(file_path)[1].lower()

        if suffix == '.pdf':
            source_pdf = file_path
        elif suffix in word_suffixes:
            # Word documents go through a PDF conversion step first
            source_pdf = convert_word_to_pdf(file_path)
        else:
            raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")

        return PyPDFLoader(source_pdf).load()
    except Exception as err:
        # Best-effort contract: report the problem and signal failure with None
        print(f"Error loading document: {str(err)}")
        return None
|
||||
@@ -0,0 +1,73 @@
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
# Location of the theme configuration. The path is relative, so it is
# resolved against the process's current working directory.
base_path = os.path.join("data", "config_files")
THEME_CONTEXT_PATH = os.path.join(base_path, "theme_context.json")

# Themes are read once, at import time. The encoding is pinned to UTF-8 so
# the JSON decodes the same way regardless of the platform's default locale.
with open(THEME_CONTEXT_PATH, "r", encoding="utf-8") as f:
    themes = json.load(f)
|
||||
|
||||
def delete_file(file_path):
    """Remove one file and report the outcome on stdout.

    An ``OSError`` from the removal (for example a missing file) is printed
    instead of being propagated, so the call never raises for that reason.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as err:
        print(f"Error deleting file {file_path}: {err}")
|
||||
|
||||
|
||||
|
||||
def delete_all_files_in_directory(directory_path):
    """Remove every regular file directly inside ``directory_path``.

    Sub-directories are left untouched. Each removal is reported on stdout.
    The first ``OSError`` (listing or removal) is printed and stops the
    sweep; it is not propagated.

    Args:
        directory_path: Directory whose immediate files should be removed.
    """
    try:
        candidates = (
            os.path.join(directory_path, entry)
            for entry in os.listdir(directory_path)
        )
        for candidate in candidates:
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)
            print(f"Deleted file: {candidate}")
    except OSError as err:
        print(f"Error deleting files in {directory_path}: {err}")
|
||||
|
||||
|
||||
def format_questions_text(questions_dict, key):
    """Format the questions stored under ``key`` as a dash-prefixed list.

    Args:
        questions_dict: Mapping whose value at ``key`` is an iterable of
            mappings, each with a ``'question'`` entry.
        key: Selects which group of questions to format.

    Returns:
        str: One ``- <question>`` line per question, joined by newlines,
        with leading and trailing whitespace stripped. An empty group
        yields an empty string.

    Raises:
        KeyError: If ``key`` is not in ``questions_dict`` or an entry has
            no ``'question'`` item.
    """
    lines = [f"- {entry['question']}" for entry in questions_dict[key]]
    return "\n".join(lines).strip()
|
||||
|
||||
|
||||
|
||||
def format_theme_text(theme_id):
    """Format one theme's id, name and context as dash-prefixed lines.

    The theme is looked up in the module-level ``themes`` list by its
    ``"id"`` value; the first match is used.

    Args:
        theme_id: Value compared against each theme's ``"id"`` entry.

    Returns:
        str: Three lines, ``- <id>``, ``- <theme>`` and ``- <context>``,
        with leading and trailing whitespace stripped.

    Raises:
        IndexError: If no theme has the requested id. The exception type
            matches what the previous list-indexing lookup raised.
        KeyError: If the matched theme lacks one of the expected entries.
    """
    current_theme = next((t for t in themes if t["id"] == theme_id), None)
    if current_theme is None:
        raise IndexError(f"No theme found with id {theme_id!r}")

    return "\n".join(
        f"- {current_theme[field]}" for field in ("id", "theme", "context")
    ).strip()
|
||||
|
||||
|
||||
|
||||
def download_pdf_and_extract_text(url: str, timeout: float = 30.0) -> str:
    """Download a PDF from ``url`` and return the text of all its pages.

    The PDF is parsed from memory, so nothing is written to disk: concurrent
    calls cannot clobber each other's data and a parsing failure leaves no
    stray file behind.

    Args:
        url (str): Address of the PDF to download.
        timeout (float): Seconds to wait for the server before giving up.
            Passed straight to ``requests.get``.

    Returns:
        str: The text of every page that yielded text, in page order,
        separated by blank lines. Pages with no extractable text are skipped.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
    """
    import io  # local import keeps this block self-contained

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses

    # PdfReader accepts a file-like object, so an in-memory buffer is enough
    reader = PdfReader(io.BytesIO(response.content))

    # Extract each page once; drop pages that produced no text
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n\n".join(text for text in page_texts if text)
|
||||
Reference in New Issue
Block a user