"""Helpers for file cleanup, text formatting and PDF text extraction."""

import io
import json
import os

import requests
from PyPDF2 import PdfReader

base_path = os.path.join("data", "config_files")
THEME_CONTEXT_PATH = os.path.join(base_path, "theme_context.json")

# Loaded once at import time; format_theme_text() looks entries up by "id".
# JSON is UTF-8 by specification, so do not rely on the platform's default
# text encoding when reading it.
with open(THEME_CONTEXT_PATH, "r", encoding="utf-8") as f:
    themes = json.load(f)


def delete_file(file_path: str) -> None:
    """Delete a single file, printing the outcome instead of raising.

    Args:
        file_path: Path of the file to remove.

    An ``OSError`` from ``os.remove`` is caught and reported on stdout, so
    this function never raises for a missing or undeletable file.
    """
    try:
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as e:
        print(f"Error deleting file {file_path}: {e}")


def delete_all_files_in_directory(directory_path: str) -> None:
    """Delete every regular file directly inside *directory_path*.

    Sub-directories are left untouched (only ``os.path.isfile`` entries are
    removed). Failures are printed rather than raised.

    Args:
        directory_path: Directory whose files should be removed.
    """
    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        print(f"Error deleting files in {directory_path}: {e}")
        return

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            # Delegate to delete_file so that one undeletable file is
            # reported and skipped instead of aborting the whole sweep.
            delete_file(file_path)


def format_questions_text(questions_dict, key) -> str:
    """Format the questions stored under *key* as a dash-prefixed list.

    Args:
        questions_dict: Mapping whose value at *key* is an iterable of
            mappings, each providing a ``"question"`` entry.
        key: Key selecting which group of questions to format.

    Returns:
        One ``"- <question>"`` line per question, joined with newlines and
        stripped of surrounding whitespace (an empty string when there are
        no questions).

    Raises:
        KeyError: If *key* is missing, or an item has no ``"question"``.
    """
    lines = (f"- {question['question']}" for question in questions_dict[key])
    return "\n".join(lines).strip()


def format_theme_text(theme_id) -> str:
    """Format the theme identified by *theme_id* as a dash-prefixed list.

    The theme is looked up in the module-level ``themes`` list; the first
    entry whose ``"id"`` equals *theme_id* is used.

    Args:
        theme_id: Value compared against each theme's ``"id"`` field.

    Returns:
        Three lines (``id``, ``theme`` and ``context``), each prefixed with
        ``"- "``, joined with newlines and stripped.

    Raises:
        IndexError: If no theme has the requested id.
    """
    current_theme = next((t for t in themes if t["id"] == theme_id), None)
    if current_theme is None:
        # Same exception type the original list-index lookup produced, but
        # with a message that says what was actually missing.
        raise IndexError(
            f"No theme with id {theme_id!r} found in {THEME_CONTEXT_PATH}"
        )

    lines = (f"- {current_theme[field]}" for field in ("id", "theme", "context"))
    return "\n".join(lines).strip()


def download_pdf_and_extract_text(url: str, *, timeout: float = 30.0) -> str:
    """Download a PDF from *url* and return the text of all its pages.

    The document is parsed from memory, so nothing is written to disk and
    concurrent calls cannot interfere with each other.

    Args:
        url: Location of the PDF to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The text of every page that yielded any text, separated by blank
        lines. Pages with no extractable text are skipped.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail early on 4xx/5xx responses

    # PdfReader accepts a binary stream, so no temporary file is needed.
    reader = PdfReader(io.BytesIO(response.content))

    # Extract each page exactly once; extraction is the expensive step.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n\n".join(text for text in page_texts if text)