diff --git a/docs/.python-version b/docs/.python-version new file mode 100644 index 0000000..423f188 --- /dev/null +++ b/docs/.python-version @@ -0,0 +1 @@ +python=3.11 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 07fd597..72223b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,23 +1,25 @@ -openai -pandas -python-dotenv -fastapi -uvicorn -langchain-community -langchain-openai -pydantic -pypdf -pypandoc -Spire.Doc +openai==1.72.0 +pandas==2.2.3 +python-dotenv==1.1.0 +fastapi==0.115.9 +uvicorn==0.34.0 +langchain-community==0.3.21 +langchain-openai==0.3.12 +pydantic==2.11.3 +pypdf==5.4.0 +pypandoc==1.15 plum-dispatch==1.7.4 -scikit-learn -werkzeug -python-multipart -langgraph -tiktoken -langchainhub -chromadb -langchain -langchain-text-splitters -beautifulsoup4 -langchain-core \ No newline at end of file +scikit-learn==1.6.1 +Werkzeug==3.1.3 +python-multipart==0.0.20 +langgraph==0.3.27 +tiktoken==0.9.0 +langchainhub==0.1.21 +chromadb==1.0.3 +langchain==0.3.23 +langchain-text-splitters==0.3.8 +beautifulsoup4==4.13.3 +langchain-core==0.3.51 +PyPDF2==3.0.1 +reportlab==4.3.1 +python-docx==1.1.2 diff --git a/scripts/generate_quiz.py b/scripts/generate_quiz.py index c0bd820..cd0c0fc 100644 --- a/scripts/generate_quiz.py +++ b/scripts/generate_quiz.py @@ -149,10 +149,18 @@ def generate_quiz(startpop_pdf, quiz_type=None) -> dict: - Conclude strongly, avoiding phrases like “and so yeah…”. ----END------ - NOTE: THE QUIZ FOCUES ON BULIDNG USER CONFIDENCE BY ANANLYZING THE QUESTIONS AND FRAMEWORK FOR EACH QUESTION IN THE STARTPOP FRAMEWORK PDF,SOLELY USE THIS PDF PROVIDED BY THE USER - BASED ON THIS FRAMEWORK , CREATE INTERVIEW BASED QUIZ FOR FIRE FIGHTING ROLE BY ANALYZING THIS DOCUMENT - NOTE : THE QUIZ SHOULD NOT BE BASED ON STARTPOP FRAMEWORK ITSELF BUT ANALYZE THE STARTPOP FRAMEWORK PRESENTED TO GENERATE INTERVIEW BASED QUIZ - e.g "The STARTPOP framework is specifically designed for firefighter interviews", THIS KIND OF QUESTION SHOULD NOT BE ASKED IN THE QUIZ.... + ### Instructions: + - Analyze the provided STARTPOP PDF to extract relevant themes and concepts. + - Generate a quiz that builds user confidence by focusing on interview-based scenarios. + - Avoid questions directly about the STARTPOP framework itself (e.g., "What is STARTPOP?"). + - Use the specified quiz type (`quiz_type`) to determine the output format. + - Generate at least 15 questions and above + + NOTE: The quiz focuses on building user confidence by analyzing the questions and framework presented in the STARTPOP PDF provided by the user. + Based on this framework, create an interview-based quiz specifically for firefighting roles by thoroughly analyzing the document. + + IMPORTANT: The quiz should not directly reference the STARTPOP framework itself. Instead, it should generate interview-based questions derived from the insights of the STARTPOP framework. + For example, avoid questions like "The STARTPOP framework is specifically designed for firefighter interviews." Such questions should not be included in the quiz. Thank you for your thorough and precise processing! STARTPOP FULL PDF :{startpop_pdf} question type : {quiz_type} diff --git a/test.py b/test.py new file mode 100644 index 0000000..04f71c5 --- /dev/null +++ b/test.py @@ -0,0 +1,61 @@ +import subprocess +import re + +# List of packages you want to include +packages = [ + "openai", + "pandas", + "python-dotenv", + "fastapi", + "uvicorn", + "langchain-community", + "langchain-openai", + "pydantic", + "pypdf", + "pypandoc", + "plum-dispatch==1.7.4", # You specified exact version here + "scikit-learn", + "werkzeug", + "python-multipart", + "langgraph", + "tiktoken", + "langchainhub", + "chromadb", + "langchain", + "langchain-text-splitters", + "beautifulsoup4", + "langchain-core", + "PyPDF2", + "reportlab", + "python-docx" +] + +# Get all installed packages with versions +result = subprocess.run(["pip", "freeze"], capture_output=True, text=True) +installed_packages = result.stdout.strip().split('\n') + +# Create a dictionary of package names to their full name with version +package_dict = {} +for pkg in installed_packages: + if '==' in pkg: + name = pkg.split('==')[0].lower() + package_dict[name] = pkg + +# Write only the requested packages to requirements.txt +with open('requirements.txt', 'w') as f: + for package in packages: + # Handle cases where version is already specifixed + if '==' in package: + f.write(f"{package}\n") + continue + + # Try to find the package in installed packages + pkg_name = package.lower() + if pkg_name in package_dict: + f.write(f"{package_dict[pkg_name]}\n") + else: + # If not found, just write the package name + f.write(f"{package}\n") + print(f"Warning: {package} not found in installed packages") + +print("requirements.txt has been generated.") \ No newline at end of file diff --git a/utils/document_loader.py b/utils/document_loader.py index b30d61a..a9743f9 100644 --- a/utils/document_loader.py +++ b/utils/document_loader.py @@ -1,32 +1,45 @@ import os -from spire.doc import Document, FileFormat -from langchain_community.document_loaders import PyPDFLoader +from docx import Document as DocxDocument +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer +from reportlab.lib.styles import getSampleStyleSheet +from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader def convert_word_to_pdf(doc_path: str) -> str: """ - Convert a .doc or .docx file to PDF using Spire.Doc. + Convert a .docx file to PDF using python-docx and reportlab. Args: - doc_path (str): The path to the .doc or .docx file. + doc_path (str): The path to the .docx file. Returns: str: The path to the converted PDF file. """ pdf_path = os.path.splitext(doc_path)[0] + '.pdf' - # Create a Document object - document = Document() # Load the Word document - document.LoadFromFile(doc_path) - # Save as PDF - document.SaveToFile(pdf_path, FileFormat.PDF) - document.Close() + doc = DocxDocument(doc_path) + + # Create a PDF + pdf = SimpleDocTemplate(pdf_path, pagesize=letter) + styles = getSampleStyleSheet() + flowables = [] + + # Extract text from paragraphs and add to PDF + for para in doc.paragraphs: + if para.text: + p = Paragraph(para.text, styles['Normal']) + flowables.append(p) + flowables.append(Spacer(1, 12)) + + # Build the PDF + pdf.build(flowables) return pdf_path def load_document(file_path: str): """ - Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF. + Utility function to load a PDF, DOCX, or DOC file. Args: file_path (str): The path to the file to load. @@ -38,16 +51,21 @@ def load_document(file_path: str): try: extension = os.path.splitext(file_path)[1].lower() - if extension in ['.doc', '.docx']: - # Convert .doc or .docx to PDF first + if extension == '.docx': + # For .docx files, use UnstructuredWordDocumentLoader directly + loader = UnstructuredWordDocumentLoader(file_path) + return loader.load() + elif extension == '.doc': + # Convert .doc to .pdf first pdf_path = convert_word_to_pdf(file_path) loader = PyPDFLoader(pdf_path) + return loader.load() elif extension == '.pdf': loader = PyPDFLoader(file_path) + return loader.load() else: raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.") - return loader.load() except Exception as e: print(f"Error loading document: {str(e)}") return None \ No newline at end of file