diff --git a/data/raw/document.doc b/data/raw/document.doc new file mode 100644 index 0000000..1136445 Binary files /dev/null and b/data/raw/document.doc differ diff --git a/data/raw/document.pdf b/data/raw/document.pdf new file mode 100644 index 0000000..7527313 Binary files /dev/null and b/data/raw/document.pdf differ diff --git a/data/raw/test_sop.pdf b/data/raw/test_sop.pdf new file mode 100644 index 0000000..feccf3d Binary files /dev/null and b/data/raw/test_sop.pdf differ diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb new file mode 100644 index 0000000..fe0654c --- /dev/null +++ b/notebooks/test.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pypandoc" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Adjust this path to point to the root of your project\n", + "project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n", + "\n", + "# Add the project root to sys.path\n", + "if project_root not in sys.path:\n", + " sys.path.insert(0, project_root)\n", + "\n", + "# Now you can import your modules\n", + "from src.services.sop_generator import SopGenerator\n", + "from src.utils.pdf_loader import load_pdf_to_docs\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install pypandoc\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "OSError", + "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) 
or\ninstall pypandoc wheels with included pandoc.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 54\u001b[0m\n\u001b[1;32m 51\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to .docx internally)\u001b[39;00m\n\u001b[0;32m---> 54\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# Load a .docx file\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 60\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n", + "Cell \u001b[0;32mIn[1], line 41\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path, use_unstructured)\u001b[0m\n\u001b[1;32m 38\u001b[0m loader \u001b[38;5;241m=\u001b[39m Docx2txtLoader(file_path)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# Convert .doc to .docx first\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_doc_to_docx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m load_document(docx_path, use_unstructured\u001b[38;5;241m=\u001b[39muse_unstructured)\n\u001b[1;32m 43\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n", + "Cell \u001b[0;32mIn[1], line 16\u001b[0m, in \u001b[0;36mconvert_doc_to_docx\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc file to .docx using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted .docx file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdocx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdocx_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docx_path\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m 
discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m 
\u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) 
or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc." + ] + } + ], + "source": [ + "import os\n", + "import pypandoc\n", + "from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader\n", + "\n", + "def convert_doc_to_docx(doc_path: str) -> str:\n", + " \"\"\"\n", + " Convert a .doc file to .docx using pypandoc.\n", + " \n", + " Args:\n", + " doc_path (str): The path to the .doc file.\n", + "\n", + " Returns:\n", + " str: The path to the converted .docx file.\n", + " \"\"\"\n", + " docx_path = os.path.splitext(doc_path)[0] + '.docx'\n", + " pypandoc.convert_file(doc_path, 'docx', outputfile=docx_path)\n", + " return docx_path\n", + "\n", + "def load_document(file_path: str, use_unstructured: bool = False):\n", + " \"\"\"\n", + " Utility function to load a PDF, DOCX, or DOC file and convert it to document objects.\n", + "\n", + " Args:\n", + " file_path (str): The path to the file to load.\n", + " use_unstructured (bool): Whether to use the Unstructured loader for .docx files. 
Defaults to False.\n", + "\n", + " Returns:\n", + " List[Document]: A list of Document objects representing the contents of the file.\n", + " \"\"\"\n", + " extension = os.path.splitext(file_path)[1].lower()\n", + " \n", + " if extension == '.pdf':\n", + " loader = PyPDFLoader(file_path)\n", + " elif extension == '.docx':\n", + " if use_unstructured:\n", + " loader = UnstructuredWordDocumentLoader(file_path)\n", + " else:\n", + " loader = Docx2txtLoader(file_path)\n", + " elif extension == '.doc':\n", + " # Convert .doc to .docx first\n", + " docx_path = convert_doc_to_docx(file_path)\n", + " return load_document(docx_path, use_unstructured=use_unstructured)\n", + " else:\n", + " raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n", + " \n", + " return loader.load()\n", + "\n", + "# Example usage:\n", + "doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n", + "#docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n", + "pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n", + "\n", + "# Load a .doc file (converts to .docx internally)\n", + "doc_docs = load_document(doc_file_path)\n", + "\n", + "# Load a .docx file\n", + "#docx_docs = load_document(docx_file_path)\n", + "\n", + "# Load a PDF file\n", + "pdf_docs = load_document(pdf_file_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "OSError", + "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) 
or\ninstall pypandoc wheels with included pandoc.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 48\u001b[0m\n\u001b[1;32m 45\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to PDF first)\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;66;03m# Load a .docx file (converts to PDF first)\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 54\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n", + "Cell \u001b[0;32mIn[4], line 33\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 29\u001b[0m extension \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(file_path)[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extension \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Convert .doc or .docx to PDF first\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m 
\u001b[43mconvert_to_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m loader \u001b[38;5;241m=\u001b[39m PyPDFLoader(pdf_path)\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m:\n", + "Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mconvert_to_pdf\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc or .docx file to PDF using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted PDF file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pdf_path\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, 
cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 
363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) 
or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc." + ] + } + ], + "source": [ + "import os\n", + "from spire.doc import Document, FileFormat\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "\n", + "def convert_word_to_pdf(doc_path: str) -> str:\n", + " \"\"\"\n", + " Convert a .doc or .docx file to PDF using Spire.Doc.\n", + " \n", + " Args:\n", + " doc_path (str): The path to the .doc or .docx file.\n", + "\n", + " Returns:\n", + " str: The path to the converted PDF file.\n", + " \"\"\"\n", + " pdf_path = os.path.splitext(doc_path)[0] + '.pdf'\n", + " \n", + " # Create a Document object\n", + " document = Document()\n", + " # Load the Word document\n", + " document.load_from_file(doc_path)\n", + " # Save as PDF\n", + " document.save_to_file(pdf_path, FileFormat.PDF)\n", + " document.close()\n", + " \n", + " return pdf_path\n", + "\n", + "def load_document(file_path: str):\n", + " \"\"\"\n", + " Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.\n", + "\n", + " Args:\n", + " file_path (str): The path to the file to load.\n", + "\n", + " Returns:\n", + " List[Document]: A list of Document objects representing the contents of the file.\n", + " \"\"\"\n", + " extension = os.path.splitext(file_path)[1].lower()\n", + " \n", + " if extension in ['.doc', '.docx']:\n", + " # Convert .doc or .docx to PDF first\n", + " pdf_path = convert_word_to_pdf(file_path)\n", + " loader = PyPDFLoader(pdf_path)\n", + " elif extension == '.pdf':\n", + " loader = PyPDFLoader(file_path)\n", + " else:\n", + " raise ValueError(f\"Unsupported file type: {extension}. 
Only .pdf, .docx, and .doc are supported.\")\n", + " \n", + " return loader.load()\n", + "\n", + "# Example usage:\n", + "doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n", + "docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n", + "pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n", + "\n", + "# Load a .doc file (converts to PDF first)\n", + "doc_docs = load_document(doc_file_path)\n", + "\n", + "# Load a .docx file (converts to PDF first)\n", + "docx_docs = load_document(docx_file_path)\n", + "\n", + "# Load a PDF file\n", + "pdf_docs = load_document(pdf_file_path)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "erp", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_sop_generator.ipynb b/notebooks/test_sop_generator.ipynb index 23cd4fe..6e195ea 100644 --- a/notebooks/test_sop_generator.ipynb +++ b/notebooks/test_sop_generator.ipynb @@ -7,10 +7,19 @@ "outputs": [], "source": [ "from langchain_community.document_loaders import PyPDFLoader\n", - "loader = PyPDFLoader(\"/content/Example SOP (1) (1).pdf\")\n", + "loader = PyPDFLoader(\"/root/ds_erp_ai/data/raw/test_sop.pdf\")\n", "docs = loader.load()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -162,20 +171,12 @@ ], "metadata": { "kernelspec": { - "display_name": "erp", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - 
def create_app():
    """Build and configure the Flask application.

    Registers the SOP blueprint under ``/api/v1/sop`` and stores the absolute
    path of the upload directory in ``app.config['UPLOAD_FOLDER']``, creating
    that directory when it does not exist yet.

    Returns:
        Flask: The configured application instance.
    """
    flask_app = Flask(__name__)
    flask_app.register_blueprint(sops_bp, url_prefix='/api/v1/sop')

    # "uploads" is resolved two levels above the directory holding this module
    module_dir = os.path.dirname(os.path.abspath(__file__))
    upload_dir = os.path.abspath(os.path.join(module_dir, '../../uploads'))

    # Create the directory up front so request handlers can save into it
    os.makedirs(upload_dir, exist_ok=True)
    flask_app.config['UPLOAD_FOLDER'] = upload_dir

    return flask_app
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}


def allowed_file(filename):
    """Return True when *filename* has a base name and an allowed extension.

    The extension is the text after the last dot and is compared
    case-insensitively against ``ALLOWED_EXTENSIONS``.

    Args:
        filename: The client-supplied file name. Anything that is not a
            string (for example ``None``) is rejected.

    Returns:
        bool: ``True`` if the name is acceptable, ``False`` otherwise.
    """
    if not isinstance(filename, str):
        return False

    stem, dot, extension = filename.rpartition('.')
    # A bare extension such as ".pdf" carries no base name; reject it up
    # front rather than accepting a name that is only an extension.
    if not dot or not stem:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
@sops_bp.route('/generate_sops', methods=['POST'])
def generate_sops():
    """Generate SOPs for the requested roles from an uploaded document.

    Expects a multipart POST with a ``document`` file part (PDF, DOC or DOCX)
    and a ``roles`` form field holding a JSON list of role names.

    Returns:
        tuple: ``(response, status)``. Status is 200 with the generated SOPs,
        400 for invalid input or when the role check reports that the
        document cannot provide SOPs, and 500 when processing raises.
    """
    # Check if the POST request has the file part
    if 'document' not in request.files:
        return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400

    current_app.logger.debug("Running generate_sops")

    uploaded = request.files['document']
    roles_json = request.form.get('roles')  # Roles arrive as a JSON string
    if not roles_json:
        return jsonify({"error": "No roles provided", "message": "Please provide a list of roles in the 'roles' field."}), 400

    try:
        roles = json.loads(roles_json)
    except json.JSONDecodeError:
        return jsonify({"error": "Invalid JSON", "message": "The 'roles' field contains invalid JSON."}), 400

    # Valid JSON is not necessarily a list (e.g. a bare string would later be
    # iterated character by character), so reject other shapes explicitly.
    if not isinstance(roles, list) or not all(isinstance(role, str) for role in roles):
        return jsonify({"error": "Invalid roles", "message": "The 'roles' field must be a JSON list of role names."}), 400
    current_app.logger.debug("Roles are: %s", roles)

    # If the user does not select a file, the browser may also submit an empty part without a filename
    if uploaded.filename == '':
        return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400

    if not (uploaded and allowed_file(uploaded.filename)):
        return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400

    filename = secure_filename(uploaded.filename)
    upload_folder = current_app.config['UPLOAD_FOLDER']
    file_path = os.path.join(upload_folder, filename)

    # Save the file to the upload folder
    uploaded.save(file_path)

    try:
        # Use the utility function to generate docs from the file
        docs = load_document(file_path)

        # Check if the document can generate SOPs for the roles
        status_check = sop_generator.check_role_sop(roles=roles, docs=docs)
        if not status_check["status"]:
            return jsonify({"error": "Document cannot extract SOPs", "message": status_check["message"]}), 400

        # Generate SOPs based on the roles provided
        sops = sop_generator.generate_sops(roles, docs)
        return jsonify({"sops": sops, "message": "SOPs successfully generated for the roles from the document."}), 200

    except Exception as e:
        current_app.logger.exception("SOP generation failed")
        return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500

    finally:
        # Cleanup runs on every exit path, including the early 400 above that
        # previously left the uploaded file behind.
        # NOTE(review): the helper name suggests it clears the whole upload
        # folder, which may affect concurrent requests - confirm.
        delete_all_files_in_directory(upload_folder)
+ If no roles are found, return an empty list.''', + }, + { + "role": "user", + "content": [{"type": "text", "text": text} for text in docs_text], + } + ], + response_format=RolesResponse, + max_tokens=1024, + temperature=0.1 + ) + return json.loads(response.choices[0].message.content) + + def check_role_sop(self, roles: str, docs) -> SOPsFound: + docs_text = self._extract_text_from_docs(docs) + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''Your role is to check if the SOPs for the provided roles "{roles}" are found in the document. + You are validating if the document can provide the SOPs. + Return status=True with a proper message if found, and status=False with a proper message if not. + Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".''' + }, + { + "role": "user", + "content": [{"type": "text", "text": text} for text in docs_text], + } + ], + response_format=SOPsFound, + max_tokens=1024, + temperature=0.1 + ) + return json.loads(response.choices[0].message.content) + + def generate_sops(self, roles: List[str], docs) -> SOPsResponse: + roles_sops_all = {} + + docs_text = self._extract_text_from_docs(docs) + + for role in roles: + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''You are a Standard Operating Procedure (SOP) extractor. + Your task is to find SOPs for the role "{role}" in the provided text. + SOPs should be categorized under "must", "shall", and "will". + If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document, + but only if there is clear evidence within the text. + Do not generate or assume SOPs that are not directly supported by the document. 
+ Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present. + If no SOPs are found for the role, return an empty list for each category.''', + }, + { + "role": "user", + "content": [{"type": "text", "text": text} for text in docs_text], + } + ], + response_format=RoleSOPs, + max_tokens=1024, + temperature=0.1 + ) + role_sop = json.loads(response.choices[0].message.content) + roles_sops_all[role] = role_sop + + return roles_sops_all + diff --git a/src/utils/document_loader.py b/src/utils/document_loader.py new file mode 100644 index 0000000..6bd6044 --- /dev/null +++ b/src/utils/document_loader.py @@ -0,0 +1,48 @@ +import os +from spire.doc import Document, FileFormat +from langchain_community.document_loaders import PyPDFLoader + +def convert_word_to_pdf(doc_path: str) -> str: + """ + Convert a .doc or .docx file to PDF using Spire.Doc. + + Args: + doc_path (str): The path to the .doc or .docx file. + + Returns: + str: The path to the converted PDF file. + """ + pdf_path = os.path.splitext(doc_path)[0] + '.pdf' + + # Create a Document object + document = Document() + # Load the Word document + document.LoadFromFile(doc_path) + # Save as PDF + document.SaveToFile(pdf_path, FileFormat.PDF) + document.Close() + + return pdf_path + +def load_document(file_path: str): + """ + Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF. + + Args: + file_path (str): The path to the file to load. + + Returns: + List[Document]: A list of Document objects representing the contents of the file. + """ + extension = os.path.splitext(file_path)[1].lower() + + if extension in ['.doc', '.docx']: + # Convert .doc or .docx to PDF first + pdf_path = convert_word_to_pdf(file_path) + loader = PyPDFLoader(pdf_path) + elif extension == '.pdf': + loader = PyPDFLoader(file_path) + else: + raise ValueError(f"Unsupported file type: {extension}. 
import os


def delete_file(file_path):
    """Delete a single file, reporting the outcome on stdout.

    Best-effort: an ``OSError`` (missing file, permission problem, ...) is
    printed rather than raised.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"Error deleting file {file_path}: {e}")
    else:
        print(f"Deleted file: {file_path}")


def delete_all_files_in_directory(directory_path):
    """Delete every regular file directly inside ``directory_path``.

    Sub-directories are left untouched. Best-effort: a failure to list the
    directory is printed and ends the call; a failure to remove one file is
    printed and the remaining files are still processed.

    Args:
        directory_path: Directory whose files should be removed.
    """
    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        print(f"Error deleting files in {directory_path}: {e}")
        return

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            # Reuse delete_file so one failed removal cannot abort the rest.
            delete_file(file_path)