role extraction and SOP generation added
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,236 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pypandoc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Adjust this path to point to the root of your project\n",
|
||||
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
||||
"\n",
|
||||
"# Add the project root to sys.path\n",
|
||||
"if project_root not in sys.path:\n",
|
||||
" sys.path.insert(0, project_root)\n",
|
||||
"\n",
|
||||
"# Now you can import your modules\n",
|
||||
"from src.services.sop_generator import SopGenerator\n",
|
||||
"from src.utils.pdf_loader import load_pdf_to_docs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install pypandoc\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "OSError",
|
||||
"evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[1], line 54\u001b[0m\n\u001b[1;32m 51\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to .docx internally)\u001b[39;00m\n\u001b[0;32m---> 54\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# Load a .docx file\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 60\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
|
||||
"Cell \u001b[0;32mIn[1], line 41\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path, use_unstructured)\u001b[0m\n\u001b[1;32m 38\u001b[0m loader \u001b[38;5;241m=\u001b[39m Docx2txtLoader(file_path)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# Convert .doc to .docx first\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_doc_to_docx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m load_document(docx_path, use_unstructured\u001b[38;5;241m=\u001b[39muse_unstructured)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||||
"Cell \u001b[0;32mIn[1], line 16\u001b[0m, in \u001b[0;36mconvert_doc_to_docx\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc file to .docx using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted .docx file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdocx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdocx_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docx_path\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import pypandoc\n",
|
||||
"from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader\n",
|
||||
"\n",
|
||||
"def convert_doc_to_docx(doc_path: str) -> str:\n",
|
||||
" \"\"\"\n",
|
||||
" Convert a .doc file to .docx using pypandoc.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" doc_path (str): The path to the .doc file.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" str: The path to the converted .docx file.\n",
|
||||
" \"\"\"\n",
|
||||
" docx_path = os.path.splitext(doc_path)[0] + '.docx'\n",
|
||||
" pypandoc.convert_file(doc_path, 'docx', outputfile=docx_path)\n",
|
||||
" return docx_path\n",
|
||||
"\n",
|
||||
"def load_document(file_path: str, use_unstructured: bool = False):\n",
|
||||
" \"\"\"\n",
|
||||
" Utility function to load a PDF, DOCX, or DOC file and convert it to document objects.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" file_path (str): The path to the file to load.\n",
|
||||
" use_unstructured (bool): Whether to use the Unstructured loader for .docx files. Defaults to False.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" List[Document]: A list of Document objects representing the contents of the file.\n",
|
||||
" \"\"\"\n",
|
||||
" extension = os.path.splitext(file_path)[1].lower()\n",
|
||||
" \n",
|
||||
" if extension == '.pdf':\n",
|
||||
" loader = PyPDFLoader(file_path)\n",
|
||||
" elif extension == '.docx':\n",
|
||||
" if use_unstructured:\n",
|
||||
" loader = UnstructuredWordDocumentLoader(file_path)\n",
|
||||
" else:\n",
|
||||
" loader = Docx2txtLoader(file_path)\n",
|
||||
" elif extension == '.doc':\n",
|
||||
" # Convert .doc to .docx first\n",
|
||||
" docx_path = convert_doc_to_docx(file_path)\n",
|
||||
" return load_document(docx_path, use_unstructured=use_unstructured)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
|
||||
" \n",
|
||||
" return loader.load()\n",
|
||||
"\n",
|
||||
"# Example usage:\n",
|
||||
"doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n",
|
||||
"#docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n",
|
||||
"pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n",
|
||||
"\n",
|
||||
"# Load a .doc file (converts to .docx internally)\n",
|
||||
"doc_docs = load_document(doc_file_path)\n",
|
||||
"\n",
|
||||
"# Load a .docx file\n",
|
||||
"#docx_docs = load_document(docx_file_path)\n",
|
||||
"\n",
|
||||
"# Load a PDF file\n",
|
||||
"pdf_docs = load_document(pdf_file_path)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "OSError",
|
||||
"evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[4], line 48\u001b[0m\n\u001b[1;32m 45\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to PDF first)\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;66;03m# Load a .docx file (converts to PDF first)\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 54\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
|
||||
"Cell \u001b[0;32mIn[4], line 33\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 29\u001b[0m extension \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(file_path)[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extension \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Convert .doc or .docx to PDF first\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_to_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m loader \u001b[38;5;241m=\u001b[39m PyPDFLoader(pdf_path)\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m:\n",
|
||||
"Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mconvert_to_pdf\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc or .docx file to PDF using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted PDF file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pdf_path\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from spire.doc import Document, FileFormat\n",
|
||||
"from langchain_community.document_loaders import PyPDFLoader\n",
|
||||
"\n",
|
||||
"def convert_word_to_pdf(doc_path: str) -> str:\n",
|
||||
" \"\"\"\n",
|
||||
" Convert a .doc or .docx file to PDF using Spire.Doc.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" doc_path (str): The path to the .doc or .docx file.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" str: The path to the converted PDF file.\n",
|
||||
" \"\"\"\n",
|
||||
" pdf_path = os.path.splitext(doc_path)[0] + '.pdf'\n",
|
||||
" \n",
|
||||
" # Create a Document object\n",
|
||||
" document = Document()\n",
|
||||
" # Load the Word document\n",
|
||||
" document.load_from_file(doc_path)\n",
|
||||
" # Save as PDF\n",
|
||||
" document.save_to_file(pdf_path, FileFormat.PDF)\n",
|
||||
" document.close()\n",
|
||||
" \n",
|
||||
" return pdf_path\n",
|
||||
"\n",
|
||||
"def load_document(file_path: str):\n",
|
||||
" \"\"\"\n",
|
||||
" Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" file_path (str): The path to the file to load.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" List[Document]: A list of Document objects representing the contents of the file.\n",
|
||||
" \"\"\"\n",
|
||||
" extension = os.path.splitext(file_path)[1].lower()\n",
|
||||
" \n",
|
||||
" if extension in ['.doc', '.docx']:\n",
|
||||
" # Convert .doc or .docx to PDF first\n",
|
||||
" pdf_path = convert_word_to_pdf(file_path)\n",
|
||||
" loader = PyPDFLoader(pdf_path)\n",
|
||||
" elif extension == '.pdf':\n",
|
||||
" loader = PyPDFLoader(file_path)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
|
||||
" \n",
|
||||
" return loader.load()\n",
|
||||
"\n",
|
||||
"# Example usage:\n",
|
||||
"doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n",
|
||||
"docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n",
|
||||
"pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n",
|
||||
"\n",
|
||||
"# Load a .doc file (converts to PDF first)\n",
|
||||
"doc_docs = load_document(doc_file_path)\n",
|
||||
"\n",
|
||||
"# Load a .docx file (converts to PDF first)\n",
|
||||
"docx_docs = load_document(docx_file_path)\n",
|
||||
"\n",
|
||||
"# Load a PDF file\n",
|
||||
"pdf_docs = load_document(pdf_file_path)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "erp",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -7,10 +7,19 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFLoader\n",
|
||||
"loader = PyPDFLoader(\"/content/Example SOP (1) (1).pdf\")\n",
|
||||
"loader = PyPDFLoader(\"/root/ds_erp_ai/data/raw/test_sop.pdf\")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@@ -162,20 +171,12 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "erp",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
|
||||
+5
-1
@@ -3,4 +3,8 @@ langchain-community
|
||||
langchain-openai
|
||||
pydantic
|
||||
flask
|
||||
python-dotenv
|
||||
python-dotenv
|
||||
pypdf
|
||||
pypandoc
|
||||
Spire.Doc
|
||||
plum-dispatch==1.7.4
|
||||
@@ -0,0 +1,7 @@
|
||||
from src.api.app import create_app
|
||||
# Build the Flask application at import time so a module-level ``app``
# object exists whether this file is executed directly or imported.
app = create_app()

# Start Flask's built-in server only when this file is run as a script.
# NOTE(review): debug=True enables Flask's debug mode; presumably intended
# for local development only -- confirm before deploying.
if __name__ == '__main__':
    app.run(debug=True, port=5401)
|
||||
@@ -0,0 +1,21 @@
|
||||
import os
|
||||
from flask import Flask
|
||||
from src.api.routes.sops import sops_bp
|
||||
|
||||
def create_app():
    """Application factory: build and configure the Flask app.

    Registers the SOP blueprint under ``/api/v1/sop``, makes sure the
    upload directory exists and stores its absolute path in
    ``app.config['UPLOAD_FOLDER']``.

    Returns:
        Flask: the configured application instance.
    """
    flask_app = Flask(__name__)

    # All SOP endpoints live under a versioned URL prefix.
    flask_app.register_blueprint(sops_bp, url_prefix='/api/v1/sop')

    # The uploads directory sits two levels above this module's directory;
    # resolve it to a normalized absolute path.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    uploads_dir = os.path.abspath(os.path.join(module_dir, '../../uploads'))

    # Create the directory on first start; a pre-existing one is fine.
    os.makedirs(uploads_dir, exist_ok=True)

    # Expose the location to request handlers through the Flask config.
    flask_app.config['UPLOAD_FOLDER'] = uploads_dir

    return flask_app
|
||||
@@ -0,0 +1,116 @@
|
||||
import os
|
||||
from flask import Blueprint, request, jsonify, current_app
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from src.services.sop_generator import SopGenerator
|
||||
from src.utils.utils import delete_all_files_in_directory
|
||||
from src.utils.document_loader import load_document
|
||||
import json
|
||||
# Initialize the Blueprint
|
||||
sops_bp = Blueprint('sops', __name__)
|
||||
|
||||
# Initialize SopGenerator
|
||||
sop_generator = SopGenerator()
|
||||
|
||||
# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}


def allowed_file(filename):
    """Return True when *filename* ends in an allowed extension.

    The extension is whatever follows the last ``.`` in the name and is
    compared case-insensitively against ``ALLOWED_EXTENSIONS``. Names
    without any dot are rejected.
    """
    # rpartition yields an empty separator when the name contains no dot.
    _stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
||||
|
||||
@sops_bp.route('/get_roles', methods=['POST'])
def get_roles():
    """Extract the roles mentioned in an uploaded document.

    Expects a multipart POST whose ``document`` part is a PDF, DOC or DOCX
    file. The file is stored in the configured upload folder, loaded with
    ``load_document`` and handed to ``sop_generator.get_roles``.

    Returns:
        A ``(response, status)`` tuple: 200 with a ``roles`` list on
        success, 400 for a missing/unnamed/disallowed file, 500 when saving
        or processing raises.
    """
    # Check if the post request has the file part
    if 'document' not in request.files:
        return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400

    file = request.files['document']

    # If the user does not select a file, the browser may also submit an empty part without filename
    if file.filename == '':
        return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400

    if not (file and allowed_file(file.filename)):
        return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400

    filename = secure_filename(file.filename)
    upload_folder = current_app.config['UPLOAD_FOLDER']
    file_path = os.path.join(upload_folder, filename)

    try:
        # Saving is inside the try so a failed or partial write is reported
        # as a JSON error and still cleaned up.
        file.save(file_path)

        # Use the utility function to generate docs from the file
        docs = load_document(file_path)

        # Generate roles from the docs
        roles = sop_generator.get_roles(docs)["roles"]

        return jsonify({"roles": roles, "message": "Roles successfully extracted from the document."}), 200

    except Exception as e:
        return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500

    finally:
        # Cleanup runs on every exit path (success or failure).
        # NOTE(review): this empties the whole upload directory, so it would
        # also remove files belonging to a concurrent request -- confirm
        # that is acceptable.
        delete_all_files_in_directory(upload_folder)
|
||||
|
||||
|
||||
|
||||
@sops_bp.route('/generate_sops', methods=['POST'])
def generate_sops():
    """Generate SOPs for the requested roles from an uploaded document.

    Expects a multipart POST with:
      * ``document`` -- the uploaded file (PDF, DOC or DOCX);
      * ``roles``    -- a JSON-encoded value describing the roles.

    Returns:
        A ``(response, status)`` tuple: 200 with ``sops`` on success, 400
        for invalid input or when the role check reports that SOPs cannot
        be extracted, 500 when saving or processing raises.
    """
    # Check if the POST request has the file part
    if 'document' not in request.files:
        return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400

    print("Running................")

    file = request.files['document']
    roles_json = request.form.get('roles')  # Get the roles as a JSON string
    if not roles_json:
        return jsonify({"error": "No roles provided", "message": "Please provide a list of roles in the 'roles' field."}), 400

    try:
        roles = json.loads(roles_json)  # Parse the roles from JSON string to a list
        print(f"Roles are:{roles}")
    except json.JSONDecodeError:
        return jsonify({"error": "Invalid JSON", "message": "The 'roles' field contains invalid JSON."}), 400

    # If the user does not select a file, the browser may also submit an empty part without a filename
    if file.filename == '':
        return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400

    if not (file and allowed_file(file.filename)):
        return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400

    filename = secure_filename(file.filename)
    upload_folder = current_app.config['UPLOAD_FOLDER']
    file_path = os.path.join(upload_folder, filename)

    try:
        # Saving is inside the try so a failed or partial write is reported
        # as a JSON error and still cleaned up.
        file.save(file_path)

        # Use the utility function to generate docs from the file
        docs = load_document(file_path)

        # Check if the document can generate SOPs for the roles
        status_check = sop_generator.check_role_sop(roles=roles, docs=docs)

        if not status_check["status"]:
            # This early return previously skipped the cleanup and left the
            # uploaded file on disk; the finally clause now covers it.
            return jsonify({"error": "Document cannot extract SOPs", "message": status_check["message"]}), 400

        # Generate SOPs based on the roles provided
        sops = sop_generator.generate_sops(roles, docs)

        return jsonify({"sops": sops, "message": "SOPs successfully generated for the roles from the document."}), 200

    except Exception as e:
        return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500

    finally:
        # Cleanup runs on every exit path, including the early 400 above.
        # NOTE(review): this empties the whole upload directory, so it would
        # also remove files belonging to a concurrent request -- confirm
        # that is acceptable.
        delete_all_files_in_directory(upload_folder)
|
||||
@@ -0,0 +1,113 @@
|
||||
import os
|
||||
import json
|
||||
from openai import OpenAI
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# Pydantic model holding three optional lists of strings named ``must``,
# ``shall`` and ``will``; each defaults to a fresh empty list.
# NOTE(review): presumably each list collects the SOP statements phrased
# with that keyword -- confirm against the generation prompt.
class SOPs(BaseModel):
    must: Optional[List[str]] = Field(default_factory=list)
    shall: Optional[List[str]] = Field(default_factory=list)
    will: Optional[List[str]] = Field(default_factory=list)
|
||||
|
||||
class RoleSOPs(BaseModel):
    # Structured-output schema for a single role: one required "sops" object
    # holding that role's categorised statements.
    sops: SOPs
|
||||
|
||||
class SOPsFound(BaseModel):
    # Structured-output schema for the "can this document provide SOPs?" check.
    # message: short human-readable explanation of the verdict.
    # status: True when SOPs were found for the requested roles, else False.
    message: str
    status: bool
|
||||
|
||||
class RolesResponse(BaseModel):
    # Structured-output schema for role extraction: the role/position names
    # found in the document (an empty list when none are found).
    roles: List[str]
|
||||
|
||||
class SOPsResponse(BaseModel):
    # Mapping from role name to that role's categorised SOPs; defaults to an
    # empty dict when no roles are supplied.
    roles_sops: Dict[str, SOPs] = Field(default_factory=dict)
|
||||
|
||||
class SopGenerator:
    """Extract roles and role-specific SOPs from documents via the OpenAI API.

    Each public method sends the page texts of ``docs`` to the chat
    completions ``parse`` endpoint together with a Pydantic
    ``response_format`` and returns the decoded JSON payload as a plain
    ``dict`` (not a model instance), which is what callers subscript.

    Documents that contain no text at all are never sent to the model; each
    method then returns the "nothing found" result its prompt describes.
    """

    def __init__(self):
        # os.getenv returns None when OPENAI_API_KEY is not set.
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o-mini"

    def _extract_text_from_docs(self, docs):
        """Extract text content from document objects."""
        return [doc.page_content for doc in docs]

    @staticmethod
    def _has_text(docs_text) -> bool:
        """Return True when at least one page holds non-whitespace text."""
        return any(text and text.strip() for text in docs_text)

    def _request_structured(self, system_prompt: str, docs_text, response_format) -> dict:
        """Run one structured-output request and decode its JSON payload.

        Args:
            system_prompt: Instruction placed in the system message.
            docs_text: Page texts; each becomes one text part of the user message.
            response_format: Pydantic model describing the expected JSON.

        Returns:
            dict: The JSON object decoded from the first choice's content.

        Raises:
            RuntimeError: If the first choice carries no content to decode.
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [{"type": "text", "text": text} for text in docs_text],
                },
            ],
            response_format=response_format,
            max_tokens=1024,
            temperature=0.1,
        )
        message = response.choices[0].message
        if message.content is None:
            # json.loads(None) would raise an opaque TypeError. Content can be
            # missing, e.g. when the model answers with a refusal instead.
            refusal = getattr(message, "refusal", None)
            raise RuntimeError(
                f"Model returned no content to parse (refusal: {refusal!r})"
            )
        return json.loads(message.content)

    def get_roles(self, docs) -> dict:
        """Extract the roles/positions mentioned in the documents.

        Args:
            docs: Document objects exposing a ``page_content`` attribute.

        Returns:
            dict: ``{"roles": [...]}``, following the ``RolesResponse`` schema.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not self._has_text(docs_text):
            return {"roles": []}

        system_prompt = (
            "Suppose you are a role/position extractor from a company document.\n"
            'You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].\n'
            "If no roles are found, return an empty list."
        )
        return self._request_structured(system_prompt, docs_text, RolesResponse)

    def check_role_sop(self, roles: List[str], docs) -> dict:
        """Check whether the documents can provide SOPs for the given roles.

        Args:
            roles: Role names to validate; interpolated as-is into the prompt.
            docs: Document objects exposing a ``page_content`` attribute.

        Returns:
            dict: ``{"message": str, "status": bool}``, following the
            ``SOPsFound`` schema.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not self._has_text(docs_text):
            return {
                "message": "The document contains no extractable text.",
                "status": False,
            }

        system_prompt = (
            f'Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.\n'
            "You are validating if the document can provide the SOPs.\n"
            "Return status=True with a proper message if found, and status=False with a proper message if not.\n"
            f'Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'
        )
        return self._request_structured(system_prompt, docs_text, SOPsFound)

    def generate_sops(self, roles: List[str], docs) -> Dict[str, dict]:
        """Extract the SOPs for each role, issuing one model request per role.

        Args:
            roles: Role names to extract SOPs for.
            docs: Document objects exposing a ``page_content`` attribute.

        Returns:
            Dict[str, dict]: Maps each role to
            ``{"sops": {"must": [...], "shall": [...], "will": [...]}}``,
            following the ``RoleSOPs`` schema.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not self._has_text(docs_text):
            # A fresh structure per role so callers can mutate one safely.
            return {
                role: {"sops": {"must": [], "shall": [], "will": []}}
                for role in roles
            }

        roles_sops_all = {}
        for role in roles:
            system_prompt = (
                "You are a Standard Operating Procedure (SOP) extractor.\n"
                f'Your task is to find SOPs for the role "{role}" in the provided text.\n'
                'SOPs should be categorized under "must", "shall", and "will".\n'
                "If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document,\n"
                "but only if there is clear evidence within the text.\n"
                "Do not generate or assume SOPs that are not directly supported by the document.\n"
                "Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.\n"
                "If no SOPs are found for the role, return an empty list for each category."
            )
            roles_sops_all[role] = self._request_structured(
                system_prompt, docs_text, RoleSOPs
            )

        return roles_sops_all
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
import os
|
||||
from spire.doc import Document, FileFormat
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The PDF is written next to the source file, with the same base name and
    a ``.pdf`` extension.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    document = Document()
    try:
        # Load the Word document and save it as PDF
        document.LoadFromFile(doc_path)
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Close the document even when loading or saving raises, so a failed
        # conversion does not leave it open.
        document.Close()

    return pdf_path
|
||||
|
||||
def load_document(file_path: str):
    """
    Load a PDF, DOC, or DOCX file through PyPDFLoader.

    Word files are converted to PDF first; PDF files are loaded directly.
    The extension check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The documents produced by ``PyPDFLoader(...).load()``.

    Raises:
        ValueError: If the extension is not .pdf, .doc, or .docx.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Reject unsupported types up front, before any conversion work.
    if suffix not in ('.pdf', '.doc', '.docx'):
        raise ValueError(f"Unsupported file type: {suffix}. Only .pdf, .docx, and .doc are supported.")

    # Only Word files need the intermediate conversion step.
    source_pdf = file_path if suffix == '.pdf' else convert_word_to_pdf(file_path)
    return PyPDFLoader(source_pdf).load()
|
||||
@@ -0,0 +1,20 @@
|
||||
import os
|
||||
def delete_file(file_path):
    """Remove a single file, reporting the outcome on stdout.

    An OSError (for example a missing file) is printed rather than raised,
    so callers can use this as best-effort cleanup.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as err:
        print(f"Error deleting file {file_path}: {err}")
|
||||
|
||||
|
||||
import os
|
||||
|
||||
def delete_all_files_in_directory(directory_path):
    """Remove every regular file directly inside ``directory_path``.

    Subdirectories are left untouched. Errors are printed rather than
    raised, and each file is handled on its own, so one file that cannot be
    removed does not stop the remaining files from being cleaned up.

    Args:
        directory_path: Directory whose files should be removed.
    """
    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        # The directory itself is missing or unreadable: nothing to clean.
        print(f"Error deleting files in {directory_path}: {e}")
        return

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            os.remove(file_path)
            print(f"Deleted file: {file_path}")
        except OSError as e:
            # Keep going: a failure on one file must not abort the cleanup.
            print(f"Error deleting file {file_path}: {e}")
|
||||
@@ -0,0 +1,21 @@
|
||||
from src.services.sop_generator import SopGenerator
|
||||
from src.utils.document_loader import load_document
|
||||
|
||||
|
||||
# Sample document used for this manual end-to-end run.
file_path = "/root/ds_erp_ai/data/raw/test_sop.pdf"

# Both of these run at import time, ahead of the __main__ guard below.
docs = load_document(file_path)
sop = SopGenerator()


if __name__ == "__main__":
    # Step 1: ask the model which roles the document mentions.
    roles = sop.get_roles(docs)["roles"]
    print(f"Roles {roles}")

    # Step 2: check whether the document can provide SOPs for those roles.
    sop_status = sop.check_role_sop(roles=roles, docs=docs)
    print(sop_status)

    # Step 3: generate SOPs for a fixed role, replacing the extracted list.
    roles = ["cloud engineer"]
    sops = sop.generate_sops(roles=roles, docs=docs)
    print(f"sops:{sops}")
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user