ds apis implemented

2025-02-06 20:12:43 +00:00
parent 58e0cbfa3c
commit 4cd9aeac51
17 changed files with 5612 additions and 0 deletions
@@ -0,0 +1,53 @@
+import os
+from spire.doc import Document, FileFormat
+from langchain_community.document_loaders import PyPDFLoader
+
+def convert_word_to_pdf(doc_path: str) -> str:
+    """
+    Convert a .doc or .docx file to PDF using Spire.Doc.
+
+    The output path is the input path with its extension replaced by ``.pdf``.
+
+    Args:
+        doc_path (str): The path to the .doc or .docx file.
+
+    Returns:
+        str: The path to the converted PDF file.
+
+    Raises:
+        Exception: Any error raised by Spire.Doc while loading or saving is
+            propagated to the caller after the document has been closed.
+    """
+    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'
+
+    document = Document()
+    try:
+        # Load the Word document and write it back out as PDF
+        document.LoadFromFile(doc_path)
+        document.SaveToFile(pdf_path, FileFormat.PDF)
+    finally:
+        # Always release the document, even when loading or saving fails
+        document.Close()
+
+    return pdf_path
+
+def load_document(file_path: str):
+    """
+    Load a PDF, DOCX, or DOC file through PyPDFLoader.
+
+    Word files (.doc / .docx) are converted to PDF with
+    ``convert_word_to_pdf`` first; PDF files are loaded directly. The
+    extension check is case-insensitive.
+
+    Args:
+        file_path (str): The path to the file to load.
+
+    Returns:
+        The value returned by ``PyPDFLoader.load()`` on success, or ``None``
+        if anything fails (including an unsupported extension); in that case
+        the error message is printed.
+    """
+    try:
+        suffix = os.path.splitext(file_path)[1].lower()
+
+        if suffix == '.pdf':
+            source_pdf = file_path
+        elif suffix in ('.doc', '.docx'):
+            # Word documents go through an intermediate PDF
+            source_pdf = convert_word_to_pdf(file_path)
+        else:
+            raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")
+
+        return PyPDFLoader(source_pdf).load()
+    except Exception as err:
+        # Best-effort contract: report the problem and signal failure with None
+        print(f"Error loading document: {str(err)}")
+        return None
@@ -0,0 +1,73 @@
+import os
+import requests
+import json
+from PyPDF2 import PdfReader
+
+# Location of the theme definitions, relative to the current working directory
+base_path = os.path.join("data", "config_files")
+THEME_CONTEXT_PATH = os.path.join(base_path, "theme_context.json")
+
+# Loaded once at import time. The encoding is explicit so the JSON is decoded
+# the same way on every platform instead of using the locale default.
+with open(THEME_CONTEXT_PATH, "r", encoding="utf-8") as f:
+    themes = json.load(f)
+
+def delete_file(file_path):
+    """
+    Remove a single file, reporting the outcome on stdout.
+
+    Args:
+        file_path: Path of the file to remove.
+
+    Returns:
+        None. An ``OSError`` during removal is caught and printed rather
+        than raised.
+    """
+    try:
+        os.unlink(file_path)
+        print(f"Deleted file: {file_path}")
+    except OSError as err:
+        print(f"Error deleting file {file_path}: {err}")
+
+
+
+def delete_all_files_in_directory(directory_path):
+    """
+    Remove every regular file directly inside a directory.
+
+    Subdirectories are left untouched. Each removal is reported on stdout.
+    The first ``OSError`` (from listing or removing) is printed and stops
+    the remaining deletions; it is not raised.
+
+    Args:
+        directory_path: Path of the directory to clear.
+    """
+    try:
+        for entry_name in os.listdir(directory_path):
+            candidate = os.path.join(directory_path, entry_name)
+            if not os.path.isfile(candidate):
+                continue
+            os.remove(candidate)
+            print(f"Deleted file: {candidate}")
+    except OSError as err:
+        print(f"Error deleting files in {directory_path}: {err}")
+
+
+def format_questions_text(questions_dict, key):
+    """
+    Render the questions stored under ``key`` as a dash-prefixed list.
+
+    Args:
+        questions_dict: Mapping whose value at ``key`` is an iterable of
+            mappings, each with a ``"question"`` entry.
+        key: Which entry of ``questions_dict`` to format.
+
+    Returns:
+        str: One ``- <question>`` line per question, with leading and
+        trailing whitespace stripped from the whole text.
+    """
+    bullet_lines = [f"- {entry['question']}" for entry in questions_dict[key]]
+    return "\n".join(bullet_lines).strip()
+
+
+
+def format_theme_text(theme_id):
+    """
+    Format a single theme as dash-prefixed lines of text.
+
+    Looks up the first entry of the module-level ``themes`` whose ``"id"``
+    equals ``theme_id`` and renders its id, theme and context, one per line.
+
+    Args:
+        theme_id: Value compared against each theme's ``"id"`` entry.
+
+    Returns:
+        str: The lines ``- <id>``, ``- <theme>`` and ``- <context>`` joined
+        by newlines, with leading and trailing whitespace stripped.
+
+    Raises:
+        IndexError: If no theme has the given id.
+    """
+    current_theme = next((t for t in themes if t["id"] == theme_id), None)
+    if current_theme is None:
+        # Same exception type the bare ``[0]`` lookup raised, with a useful message
+        raise IndexError(f"No theme found with id {theme_id!r}")
+
+    values = (current_theme["id"], current_theme["theme"], current_theme["context"])
+    return "\n".join(f"- {value}" for value in values).strip()
+
+
+
+def download_pdf_and_extract_text(url: str, timeout: float = 30.0) -> str:
+    """
+    Download a PDF and return the text of all its pages.
+
+    The PDF is parsed from memory, so no temporary file is written and
+    concurrent calls cannot overwrite each other's download.
+
+    Args:
+        url (str): Address of the PDF to download.
+        timeout (float): Seconds passed to ``requests.get`` as its timeout,
+            so a stalled server cannot block the caller forever.
+
+    Returns:
+        str: The text of every page that yields any text, joined by blank
+        lines. Pages with no extractable text are skipped.
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or the
+            server answers with an error status.
+    """
+    import io  # local import: only this function needs it
+
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()  # Raise an error for bad responses
+
+    reader = PdfReader(io.BytesIO(response.content))
+
+    # Extract each page once; the generator keeps only one page's text alive
+    page_texts = (page.extract_text() for page in reader.pages)
+    return "\n\n".join(text for text in page_texts if text)