103 lines
2.9 KiB
Python
103 lines
2.9 KiB
Python
import io
import json
import os

import requests
from PyPDF2 import PdfReader
|
|
|
|
# Directory holding the JSON configuration files (a relative path, so it is
# resolved against the current working directory when opened).
base_path = os.path.join("data", "config_files")
THEME_CONTEXT_PATH = os.path.join(base_path, "theme_context.json")

# Parse the theme definitions once, at import time; format_theme_text looks
# themes up in this module-level object.
with open(THEME_CONTEXT_PATH, encoding="utf-8") as f:
    themes = json.load(f)
|
|
|
|
|
|
def delete_file(file_path):
    """Remove the file at ``file_path`` and print what happened.

    An ``OSError`` (for example, the file does not exist) is reported on
    standard output rather than raised, so callers can treat this as a
    best-effort cleanup step.

    Parameters:
    - file_path: Path of the file to remove.
    """
    try:
        # os.unlink performs the same operation as os.remove.
        os.unlink(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as err:
        print(f"Error deleting file {file_path}: {err}")
|
|
|
|
|
|
|
|
def delete_all_files_in_directory(directory_path):
    """Remove every regular file directly inside ``directory_path``.

    Sub-directories (and anything else ``os.path.isfile`` rejects) are left
    alone; the directory is not traversed recursively. Each removal is
    printed. The first ``OSError`` - from listing the directory or from
    removing a file - is printed and ends the sweep.

    Parameters:
    - directory_path: Path of the directory whose files should be removed.
    """
    try:
        for entry_name in os.listdir(directory_path):
            candidate = os.path.join(directory_path, entry_name)
            if not os.path.isfile(candidate):
                continue  # only plain files are removed
            os.remove(candidate)
            print(f"Deleted file: {candidate}")
    except OSError as err:
        print(f"Error deleting files in {directory_path}: {err}")
|
|
|
|
|
|
def format_questions_text(questions_dict, key):
    """Render the questions stored under ``key`` as a dash-prefixed list.

    Parameters:
    - questions_dict: Mapping whose ``key`` entry is an iterable of items
      that each provide a ``'question'`` value.
    - key: Which entry of ``questions_dict`` to format.

    Returns:
    - str: One ``"- <question>"`` line per item, joined by newlines, with
      leading and trailing whitespace stripped. Empty when there are no items.
    """
    bullet_lines = [f"- {entry['question']}" for entry in questions_dict[key]]
    return "\n".join(bullet_lines).strip()
|
|
|
|
|
|
|
|
def format_theme_text(theme_id):
    """Format one theme's id, theme and context as a dash-prefixed list.

    Looks up the first entry of the module-level ``themes`` collection whose
    ``"id"`` equals ``theme_id`` and renders its ``"id"``, ``"theme"`` and
    ``"context"`` values, one per line, each prefixed with ``"- "``.

    Parameters:
    - theme_id: Value compared (with ``==``) against each theme's ``"id"``.

    Returns:
    - str: The three lines joined by newlines, with leading and trailing
      whitespace stripped.

    Raises:
    - IndexError: If no entry in ``themes`` has the given id.
    """
    current_theme = next((t for t in themes if t["id"] == theme_id), None)
    if current_theme is None:
        # Keep the exception type an unmatched id has always produced
        # (indexing an empty match list), but say which id was missing.
        raise IndexError(f"No theme found with id {theme_id!r}")

    lines = [
        f"- {current_theme['id']}",
        f"- {current_theme['theme']}",
        f"- {current_theme['context']}",
    ]
    return "\n".join(lines).strip()
|
|
|
|
|
|
|
|
def download_pdf_and_extract_text(url: str, timeout: float = 30.0) -> str:
    """Download a PDF from ``url`` and return the text of its pages.

    The response body is parsed directly from memory. No temporary file is
    written, so concurrent calls cannot overwrite each other's download and
    nothing is left on disk if parsing fails.

    Parameters:
    - url (str): Address fetched with an HTTP GET.
    - timeout (float): Value passed to ``requests.get`` as ``timeout`` so a
      stalled server cannot block the caller forever. Pass ``None`` to wait
      indefinitely.

    Returns:
    - str: The text of every page that yields non-empty text, in page order,
      separated by blank lines.

    Raises:
    - requests.exceptions.HTTPError: If the server answers with an error
      status (raised by ``raise_for_status``).
    - requests.exceptions.RequestException: For other request failures,
      including timeouts.
    """
    # Download the PDF from the URL
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses

    # Load the PDF straight from the downloaded bytes
    reader = PdfReader(io.BytesIO(response.content))

    # Extract each page once, then keep only pages that produced text
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n\n".join(text for text in page_texts if text)
|
|
|
|
|
|
def format_qna_json_text(json_data):
    """
    Format a list of Q&A JSON data into a text string with dashes.

    Parameters:
    - json_data (list): A list of Q&A dictionaries with 'question' and 'answer' keys.

    Returns:
    - str: A formatted text string. Each complete entry contributes a
      "- Question: ..." line followed by an " Answer: ..." line; any other
      entry contributes "- Incomplete Q&A entry". The result is stripped of
      leading and trailing whitespace, and is empty when ``json_data`` is
      not a list.
    """
    # Only a list of entries is formatted; any other input yields no text.
    if not isinstance(json_data, list):
        return ""

    lines = []
    for item in json_data:
        # Entries that are not dictionaries (numbers, None, strings, ...)
        # cannot hold a question/answer pair, so they are reported as
        # incomplete instead of raising TypeError on the key lookups.
        if isinstance(item, dict) and 'question' in item and 'answer' in item:
            lines.append(f"- Question: {item['question']}")
            lines.append(f" Answer: {item['answer']}")
        else:
            lines.append("- Incomplete Q&A entry")

    return "\n".join(lines).strip()
|
|
|
|
# Example usage:
|
|
|
|
|