role extraction and sop generation added

This commit is contained in:
2024-08-31 01:29:39 +00:00
parent ccb0db21d6
commit 1f02a30a16
15 changed files with 734 additions and 11 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
+236
View File
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pypandoc"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Adjust this path to point to the root of your project\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
"\n",
"# Add the project root to sys.path\n",
"if project_root not in sys.path:\n",
" sys.path.insert(0, project_root)\n",
"\n",
"# Now you can import your modules\n",
"from src.services.sop_generator import SopGenerator\n",
"from src.utils.pdf_loader import load_pdf_to_docs\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pip install pypandoc\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 54\u001b[0m\n\u001b[1;32m 51\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to .docx internally)\u001b[39;00m\n\u001b[0;32m---> 54\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# Load a .docx file\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 60\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
"Cell \u001b[0;32mIn[1], line 41\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path, use_unstructured)\u001b[0m\n\u001b[1;32m 38\u001b[0m loader \u001b[38;5;241m=\u001b[39m Docx2txtLoader(file_path)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# Convert .doc to .docx first\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_doc_to_docx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m load_document(docx_path, use_unstructured\u001b[38;5;241m=\u001b[39muse_unstructured)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"Cell \u001b[0;32mIn[1], line 16\u001b[0m, in \u001b[0;36mconvert_doc_to_docx\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc file to .docx using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted .docx file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdocx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdocx_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docx_path\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
]
}
],
"source": [
"import os\n",
"import pypandoc\n",
"from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader\n",
"\n",
"def convert_doc_to_docx(doc_path: str) -> str:\n",
" \"\"\"\n",
" Convert a .doc file to .docx using pypandoc.\n",
" \n",
" Args:\n",
" doc_path (str): The path to the .doc file.\n",
"\n",
" Returns:\n",
" str: The path to the converted .docx file.\n",
" \"\"\"\n",
" docx_path = os.path.splitext(doc_path)[0] + '.docx'\n",
" pypandoc.convert_file(doc_path, 'docx', outputfile=docx_path)\n",
" return docx_path\n",
"\n",
"def load_document(file_path: str, use_unstructured: bool = False):\n",
" \"\"\"\n",
" Utility function to load a PDF, DOCX, or DOC file and convert it to document objects.\n",
"\n",
" Args:\n",
" file_path (str): The path to the file to load.\n",
" use_unstructured (bool): Whether to use the Unstructured loader for .docx files. Defaults to False.\n",
"\n",
" Returns:\n",
" List[Document]: A list of Document objects representing the contents of the file.\n",
" \"\"\"\n",
" extension = os.path.splitext(file_path)[1].lower()\n",
" \n",
" if extension == '.pdf':\n",
" loader = PyPDFLoader(file_path)\n",
" elif extension == '.docx':\n",
" if use_unstructured:\n",
" loader = UnstructuredWordDocumentLoader(file_path)\n",
" else:\n",
" loader = Docx2txtLoader(file_path)\n",
" elif extension == '.doc':\n",
" # Convert .doc to .docx first\n",
" docx_path = convert_doc_to_docx(file_path)\n",
" return load_document(docx_path, use_unstructured=use_unstructured)\n",
" else:\n",
" raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
" \n",
" return loader.load()\n",
"\n",
"# Example usage:\n",
"doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n",
"#docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n",
"pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n",
"\n",
"# Load a .doc file (converts to .docx internally)\n",
"doc_docs = load_document(doc_file_path)\n",
"\n",
"# Load a .docx file\n",
"#docx_docs = load_document(docx_file_path)\n",
"\n",
"# Load a PDF file\n",
"pdf_docs = load_document(pdf_file_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 48\u001b[0m\n\u001b[1;32m 45\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to PDF first)\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;66;03m# Load a .docx file (converts to PDF first)\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 54\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
"Cell \u001b[0;32mIn[4], line 33\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 29\u001b[0m extension \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(file_path)[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extension \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Convert .doc or .docx to PDF first\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_to_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m loader \u001b[38;5;241m=\u001b[39m PyPDFLoader(pdf_path)\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m:\n",
"Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mconvert_to_pdf\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc or .docx file to PDF using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted PDF file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pdf_path\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
]
}
],
"source": [
"import os\n",
"from spire.doc import Document, FileFormat\n",
"from langchain_community.document_loaders import PyPDFLoader\n",
"\n",
"def convert_word_to_pdf(doc_path: str) -> str:\n",
" \"\"\"\n",
" Convert a .doc or .docx file to PDF using Spire.Doc.\n",
" \n",
" Args:\n",
" doc_path (str): The path to the .doc or .docx file.\n",
"\n",
" Returns:\n",
" str: The path to the converted PDF file.\n",
" \"\"\"\n",
" pdf_path = os.path.splitext(doc_path)[0] + '.pdf'\n",
" \n",
" # Create a Document object\n",
" document = Document()\n",
" # Load the Word document\n",
" document.load_from_file(doc_path)\n",
" # Save as PDF\n",
" document.save_to_file(pdf_path, FileFormat.PDF)\n",
" document.close()\n",
" \n",
" return pdf_path\n",
"\n",
"def load_document(file_path: str):\n",
" \"\"\"\n",
" Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.\n",
"\n",
" Args:\n",
" file_path (str): The path to the file to load.\n",
"\n",
" Returns:\n",
" List[Document]: A list of Document objects representing the contents of the file.\n",
" \"\"\"\n",
" extension = os.path.splitext(file_path)[1].lower()\n",
" \n",
" if extension in ['.doc', '.docx']:\n",
" # Convert .doc or .docx to PDF first\n",
" pdf_path = convert_word_to_pdf(file_path)\n",
" loader = PyPDFLoader(pdf_path)\n",
" elif extension == '.pdf':\n",
" loader = PyPDFLoader(file_path)\n",
" else:\n",
" raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
" \n",
" return loader.load()\n",
"\n",
"# Example usage:\n",
"doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n",
"docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n",
"pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n",
"\n",
"# Load a .doc file (converts to PDF first)\n",
"doc_docs = load_document(doc_file_path)\n",
"\n",
"# Load a .docx file (converts to PDF first)\n",
"docx_docs = load_document(docx_file_path)\n",
"\n",
"# Load a PDF file\n",
"pdf_docs = load_document(pdf_file_path)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "erp",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
+11 -10
View File
@@ -7,10 +7,19 @@
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFLoader\n",
"loader = PyPDFLoader(\"/content/Example SOP (1) (1).pdf\")\n",
"loader = PyPDFLoader(\"/root/ds_erp_ai/data/raw/test_sop.pdf\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -162,20 +171,12 @@
],
"metadata": {
"kernelspec": {
"display_name": "erp",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
+5 -1
View File
@@ -3,4 +3,8 @@ langchain-community
langchain-openai
pydantic
flask
python-dotenv
python-dotenv
pypdf
pypandoc
Spire.Doc
plum-dispatch==1.7.4
+7
View File
@@ -0,0 +1,7 @@
from src.api.app import create_app
# Build the Flask application at import time so the module-level name `app`
# exists whether this file is imported or executed directly.
app = create_app()

if __name__ == '__main__':
    # Start Flask's built-in server on port 5401 when run as a script.
    # NOTE(review): debug=True turns on Flask's debug mode; confirm this entry
    # point is only ever used for local development.
    app.run(debug=True, port=5401)
+21
View File
@@ -0,0 +1,21 @@
import os
from flask import Flask
from src.api.routes.sops import sops_bp
def create_app():
    """Build the Flask application used by the SOP service.

    The SOP blueprint is mounted under ``/api/v1/sop`` and an upload
    directory is prepared and published as ``app.config['UPLOAD_FOLDER']``.

    Returns:
        Flask: The configured application instance.
    """
    flask_app = Flask(__name__)

    # All SOP endpoints live below this URL prefix.
    flask_app.register_blueprint(sops_bp, url_prefix='/api/v1/sop')

    # The uploads directory is resolved relative to this module: two levels
    # above the directory that contains this file.
    # NOTE(review): an earlier comment said "inside the src directory";
    # confirm where this resolves for the actual package layout.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    uploads_dir = os.path.abspath(os.path.join(module_dir, '../../uploads'))

    # Create the directory on first start; an existing one is left untouched.
    os.makedirs(uploads_dir, exist_ok=True)

    # Route handlers read the location back from the Flask config.
    flask_app.config['UPLOAD_FOLDER'] = uploads_dir
    return flask_app
View File
+116
View File
@@ -0,0 +1,116 @@
import os
from flask import Blueprint, request, jsonify, current_app
from werkzeug.utils import secure_filename
from src.services.sop_generator import SopGenerator
from src.utils.utils import delete_all_files_in_directory
from src.utils.document_loader import load_document
import json
# Blueprint that groups the SOP endpoints defined in this module; the
# application factory is expected to register it under a URL prefix.
sops_bp = Blueprint('sops', __name__)
# A single SopGenerator is created when this module is imported and is shared
# by every request handled by the routes below.
sop_generator = SopGenerator()
# Extensions (lower-case, without the dot) accepted by the upload endpoints.
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered and the comparison is
    case-insensitive. A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
@sops_bp.route('/get_roles', methods=['POST'])
def get_roles():
    """Extract the roles mentioned in an uploaded document.

    Expects a multipart POST with the file under the key ``document``.
    The file is stored in the configured upload folder, loaded, passed to
    the SOP generator, and the upload folder is emptied afterwards.

    Returns:
        A ``(json, status)`` pair: 200 with ``roles`` on success, 400 for a
        missing/empty/unsupported upload, 500 when processing fails.
    """
    # The multipart body must contain a part named 'document'.
    if 'document' not in request.files:
        return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400

    uploaded = request.files['document']

    # Browsers may submit an empty part without a filename when nothing was chosen.
    if uploaded.filename == '':
        return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400

    # Reject anything that is not a PDF, DOC or DOCX before touching the disk.
    if not (uploaded and allowed_file(uploaded.filename)):
        return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400

    safe_name = secure_filename(uploaded.filename)
    upload_dir = current_app.config['UPLOAD_FOLDER']
    saved_path = os.path.join(upload_dir, safe_name)

    # Persist the upload so the loader can read it from disk.
    uploaded.save(saved_path)

    try:
        # Turn the stored file into document objects.
        pages = load_document(saved_path)

        # Ask the generator for the roles found in those documents.
        found_roles = sop_generator.get_roles(pages)["roles"]

        # Success path: empty the upload folder before answering.
        delete_all_files_in_directory(upload_dir)

        return jsonify({"roles": found_roles, "message": "Roles successfully extracted from the document."}), 200
    except Exception as exc:
        # Failure path: still empty the upload folder, then report the error.
        delete_all_files_in_directory(upload_dir)
        return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(exc)}"}), 500
@sops_bp.route('/generate_sops', methods=['POST'])
def generate_sops():
    """Generate SOPs for the requested roles from an uploaded document.

    Expects a multipart POST with the file under the key ``document`` and a
    form field ``roles`` holding a JSON list of role names.

    Returns:
        A ``(json, status)`` pair:
        * 200 with ``sops`` when generation succeeds,
        * 400 for a missing/empty/unsupported upload, missing or malformed
          ``roles``, or when the generator reports that the document does
          not contain SOPs for the roles,
        * 500 when loading or generation raises.

    The upload folder is emptied on every path that runs after the file has
    been saved, including the early 400 return for a failed SOP check.
    """
    # Check if the POST request has the file part
    if 'document' not in request.files:
        return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400

    current_app.logger.debug("Running................")
    file = request.files['document']
    roles_json = request.form.get('roles')  # Roles arrive as a JSON string

    if not roles_json:
        return jsonify({"error": "No roles provided", "message": "Please provide a list of roles in the 'roles' field."}), 400

    try:
        roles = json.loads(roles_json)  # Parse the roles from JSON string to a list
    except json.JSONDecodeError:
        return jsonify({"error": "Invalid JSON", "message": "The 'roles' field contains invalid JSON."}), 400

    # Valid JSON that is not a list (a bare string, number or object) cannot be
    # iterated as a list of roles, so reject it up front.
    if not isinstance(roles, list):
        return jsonify({"error": "Invalid roles", "message": "The 'roles' field must be a JSON list of roles."}), 400
    current_app.logger.debug("Roles are:%s", roles)

    # If the user does not select a file, the browser may also submit an empty part without a filename
    if file.filename == '':
        return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400

    if not (file and allowed_file(file.filename)):
        return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400

    filename = secure_filename(file.filename)
    upload_folder = current_app.config['UPLOAD_FOLDER']
    file_path = os.path.join(upload_folder, filename)

    # Save the file to the upload folder
    file.save(file_path)

    try:
        # Use the utility function to generate docs from the file
        docs = load_document(file_path)

        # Check if the document can generate SOPs for the roles
        status_check = sop_generator.check_role_sop(roles=roles, docs=docs)
        if not status_check["status"]:
            return jsonify({"error": "Document cannot extract SOPs", "message": status_check["message"]}), 400

        # Generate SOPs based on the roles provided
        sops = sop_generator.generate_sops(roles, docs)

        return jsonify({"sops": sops, "message": "SOPs successfully generated for the roles from the document."}), 200
    except Exception as e:
        return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500
    finally:
        # Cleanup runs on success, on the early 400 return above and on errors,
        # so an uploaded document is never left behind in the upload folder.
        delete_all_files_in_directory(upload_folder)
+113
View File
@@ -0,0 +1,113 @@
import os
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
# SOP statements for a single role, grouped into the three categories the
# extraction prompt asks for: "must", "shall" and "will".
# Each field may be None and defaults to a fresh empty list.
class SOPs(BaseModel):
    must: Optional[List[str]] = Field(default_factory=list)
    shall: Optional[List[str]] = Field(default_factory=list)
    will: Optional[List[str]] = Field(default_factory=list)
# Wrapper holding the categorized SOPs of one role; used as the response
# format of the per-role extraction request in SopGenerator.generate_sops.
class RoleSOPs(BaseModel):
    sops: SOPs
# Result of the "does this document contain SOPs for these roles" check;
# used as the response format in SopGenerator.check_role_sop.
class SOPsFound(BaseModel):
    message: str  # short explanation of the outcome
    status: bool  # True when SOPs were found, False otherwise
# Roles/positions extracted from a document; used as the response format in
# SopGenerator.get_roles.
class RolesResponse(BaseModel):
    roles: List[str]
# Mapping from a role name to its categorized SOPs; defaults to an empty dict.
# NOTE(review): only referenced as a return annotation in the visible code —
# confirm whether it is used elsewhere.
class SOPsResponse(BaseModel):
    roles_sops: Dict[str, SOPs] = Field(default_factory=dict)
class SopGenerator:
    """Extract roles and role-specific SOPs from documents with an OpenAI model.

    Every public method sends the text of the supplied documents to the chat
    completions ``parse`` endpoint with a pydantic response format and returns
    the reply as a plain ``dict`` decoded from the JSON message content (not
    as a pydantic model instance).
    """

    def __init__(self):
        # The API key is read from the environment when the instance is created.
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o-mini"

    def _extract_text_from_docs(self, docs):
        """Extract text content from document objects."""
        return [doc.page_content for doc in docs]

    def _structured_completion(self, system_prompt: str, docs_text: List[str], response_format) -> dict:
        """Send one structured-output request and return the decoded JSON reply.

        Args:
            system_prompt: Instruction placed in the system message.
            docs_text: Document texts; each becomes one text part of the
                user message.
            response_format: Pydantic model class describing the reply schema.

        Returns:
            The message content parsed with ``json.loads``.

        Raises:
            ValueError: If the reply carries no content.
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": [{"type": "text", "text": text} for text in docs_text],
                },
            ],
            response_format=response_format,
            max_tokens=1024,
            temperature=0.1,
        )
        content = response.choices[0].message.content
        if content is None:
            # Without this check json.loads(None) fails with an opaque TypeError.
            # NOTE(review): presumably this happens when the model refuses —
            # confirm against the OpenAI structured-outputs documentation.
            raise ValueError("The model returned no content for the structured response.")
        return json.loads(content)

    def get_roles(self, docs) -> Dict[str, List[str]]:
        """Return ``{"roles": [...]}`` with the roles found in *docs*."""
        docs_text = self._extract_text_from_docs(docs)
        # Prompt continuation lines are kept flush-left so the text sent to the
        # model is unchanged.
        system_prompt = '''Suppose you are a role/position extractor from a company document.
You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].
If no roles are found, return an empty list.'''
        return self._structured_completion(system_prompt, docs_text, RolesResponse)

    def check_role_sop(self, roles: List[str], docs) -> dict:
        """Check whether *docs* contain SOPs for *roles*.

        Args:
            roles: Role names to look for (callers pass a list; it is
                interpolated into the prompt as-is).
            docs: Document objects exposing ``page_content``.

        Returns:
            A dict with the keys ``message`` (str) and ``status`` (bool).
        """
        docs_text = self._extract_text_from_docs(docs)
        system_prompt = f'''Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.
You are validating if the document can provide the SOPs.
Return status=True with a proper message if found, and status=False with a proper message if not.
Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'''
        return self._structured_completion(system_prompt, docs_text, SOPsFound)

    def generate_sops(self, roles: List[str], docs) -> Dict[str, dict]:
        """Extract SOPs for each role in *roles*, one request per role.

        Returns:
            A dict mapping each role to ``{"sops": {"must": [...],
            "shall": [...], "will": [...]}}`` as returned by the model.
        """
        roles_sops_all = {}
        docs_text = self._extract_text_from_docs(docs)

        for role in roles:
            system_prompt = f'''You are a Standard Operating Procedure (SOP) extractor.
Your task is to find SOPs for the role "{role}" in the provided text.
SOPs should be categorized under "must", "shall", and "will".
If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document,
but only if there is clear evidence within the text.
Do not generate or assume SOPs that are not directly supported by the document.
Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.
If no SOPs are found for the role, return an empty list for each category.'''
            roles_sops_all[role] = self._structured_completion(system_prompt, docs_text, RoleSOPs)

        return roles_sops_all
+48
View File
@@ -0,0 +1,48 @@
import os
from spire.doc import Document, FileFormat
from langchain_community.document_loaders import PyPDFLoader
def convert_word_to_pdf(doc_path: str) -> str:
    """
    Convert a .doc or .docx file to PDF using Spire.Doc.

    The PDF is written next to the source file, using the same base name
    with a ``.pdf`` extension.

    Args:
        doc_path (str): The path to the .doc or .docx file.

    Returns:
        str: The path to the converted PDF file.

    Raises:
        Whatever Spire.Doc raises when loading or saving fails; the document
        object is closed before the exception propagates.
    """
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    # Create a Document object
    document = Document()
    try:
        # Load the Word document
        document.LoadFromFile(doc_path)
        # Save as PDF
        document.SaveToFile(pdf_path, FileFormat.PDF)
    finally:
        # Always release the document, even when loading or saving raised.
        document.Close()

    return pdf_path
def load_document(file_path: str):
    """
    Load a PDF, DOC, or DOCX file through PyPDFLoader.

    Word files are first converted to PDF with convert_word_to_pdf; PDF
    files are read directly. The extension check is case-insensitive.

    Args:
        file_path (str): The path to the file to load.

    Returns:
        The value returned by ``PyPDFLoader(...).load()`` for the resulting PDF.

    Raises:
        ValueError: If the file extension is not .pdf, .doc, or .docx.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        # Already a PDF: read it as-is.
        pdf_source = file_path
    elif extension in ('.doc', '.docx'):
        # Word documents go through an intermediate PDF on disk.
        pdf_source = convert_word_to_pdf(file_path)
    else:
        raise ValueError(f"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.")

    return PyPDFLoader(pdf_source).load()
+20
View File
@@ -0,0 +1,20 @@
import os
def delete_file(file_path):
    """Remove a single file, reporting the outcome on stdout.

    Best-effort helper: an OSError (for example a missing file) is printed
    instead of being raised.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    except OSError as err:
        # Deliberately swallowed so cleanup never interrupts the caller.
        print(f"Error deleting file {file_path}: {err}")
import os
def delete_all_files_in_directory(directory_path):
    """Delete every regular file directly inside *directory_path*.

    Subdirectories are left untouched. The helper is best-effort: failures
    are printed rather than raised, and a file that cannot be removed does
    not prevent the remaining files from being deleted.

    Args:
        directory_path: Directory whose files should be removed.
    """
    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        # The directory itself is missing or unreadable: nothing to clean.
        print(f"Error deleting files in {directory_path}: {e}")
        return

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            os.remove(file_path)
            print(f"Deleted file: {file_path}")
        except OSError as e:
            # Report and keep going so one locked file does not leave the
            # rest of the directory uncleaned.
            print(f"Error deleting file {file_path}: {e}")
+21
View File
@@ -0,0 +1,21 @@
from src.services.sop_generator import SopGenerator
from src.utils.document_loader import load_document
# Manual end-to-end check of the loader and the SOP generator against a
# sample document at a fixed absolute path.
file_path = "/root/ds_erp_ai/data/raw/test_sop.pdf"
# NOTE(review): the document is loaded and the generator is constructed at
# import time, outside the __main__ guard below.
docs = load_document(file_path)
sop = SopGenerator()

if __name__ == "__main__":
    # Assuming 'sop' is an instance of SopGenerator and 'docs' is the loaded document content.
    # Step 1: Get the roles from the document
    roles = sop.get_roles(docs)["roles"]
    print(f"Roles {roles}")
    # Step 2: Ask whether the document contains SOPs for the extracted roles.
    sop_status = sop.check_role_sop(roles=roles, docs=docs)
    print(sop_status)
    # Step 3: Generate SOPs for a fixed role; this replaces the roles
    # extracted in step 1.
    roles = ["cloud engineer"]
    sops = sop.generate_sops(roles=roles,docs=docs)
    print(f"sops:{sops}")
+136
View File
File diff suppressed because one or more lines are too long