This commit is contained in:
teslim
2025-06-13 19:32:33 +01:00
parent bfcdcf4786
commit 31d85a83e7
3 changed files with 12 additions and 11 deletions
-4
View File
@@ -39,13 +39,9 @@ def validate_worker_document():
file_path = os.path.join(upload_folder, filename)
file.save(file_path)
print("loading document")
# Load the document for processing
docs = load_document(file_path)
print("document loaded")
# Instantiate the chatbot service
chatbot = Chatbot()
+1 -5
View File
@@ -77,7 +77,7 @@ dummy_data = generate_dummy_data(num_assessments=100, max_users_per_assessment=5
#SopGeneratorDocument
class Chatbot:
def __init__(self):
self.api_key = os.getenv("OPENAI_API_KEY")
self.api_key = os.getenv("OPENAI_API_KEY") or "sk-svcacct-v2m4jSLxCTLR-WizUhZnkOHEuftWNVy2k7vIGWDGJogaBr5VogTTVT3BlbkFJjCxOowETlz7muR8eAS7ExO1NA7kvcrZ4HVhS66jxK8PpvNce1kAAA"
self.client = OpenAI(api_key=self.api_key)
self.model = "gpt-4o-mini"
@@ -87,11 +87,7 @@ class Chatbot:
def _extract_text_from_docs(self, docs):
"""Extract text content from document objects."""
print(docs)
return [self.clean_text(doc.page_content) for doc in docs]
# Existing methods...
+11 -2
View File
@@ -4,6 +4,16 @@ from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
import pdfplumber
from langchain_core.documents import Document
def load_pdf_with_pdfplumber(file_path):
    """Load a PDF into one ``Document`` per page using pdfplumber.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A list of ``Document`` objects in page order. Each holds the text
        extracted from one page as ``page_content`` and the zero-based
        page index under the ``"page"`` metadata key.
    """
    docs = []
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # A page with no text layer (e.g. a scanned image) may yield no
            # text at all; normalise to "" so page_content is always a
            # string and downstream text cleaning does not receive None.
            text = page.extract_text() or ""
            docs.append(Document(page_content=text, metadata={"page": i}))
    return docs
def convert_word_to_pdf(doc_path: str) -> str:
"""
@@ -61,8 +71,7 @@ def load_document(file_path: str):
loader = PyPDFLoader(pdf_path)
return loader.load()
elif extension == '.pdf':
loader = PyPDFLoader(file_path)
return loader.load()
return load_pdf_with_pdfplumber(file_path)
else:
raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")