diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb index fe0654c..d5617f1 100644 --- a/notebooks/test.ipynb +++ b/notebooks/test.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -27,102 +27,186 @@ "\n", "# Now you can import your modules\n", "from src.services.sop_generator import SopGenerator\n", - "from src.utils.pdf_loader import load_pdf_to_docs\n" + "from src.utils.document_loader import load_document\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': '/root/ds_erp_ai/data/raw/document.pdf', 'page': 0}, page_content='STANDARD OPERATING PROCEDURE Accounts Receivable Department - \\nCollections\\nSOP No: 0000001 \\nSOP Title: Minimizing unnecessary \\ncommunication (Emails) \\nEvaluation Warning: The document was created with Spire.Doc for Python.\\nSOP \\nNumberSOP 0000001\\nSOP Title Minimizing unnecessary communication (Emails)\\nNAME TITLE SIGNATUR\\nEDATE\\nAuthor Angela Lewis AR Director\\n 03/12/24\\nReviewer Jamie Vega \\nJessica Merzougui\\nDeniece SantosAR Supervisor\\nAR Supervisor\\nAR Analyst\\nAudience Full AR Department\\nEffective Date: 03/12/24\\nPurpose: To minimize unnecessary email communication regarding carbon \\ncopies\\nSOP Statement: \\nOnly cc your immediate supervisor on any regular follow-ups per account by \\nAR Collector. If additional escalations are needed, the AR Supervisor must \\nprovide that information to the AR Director. This is to ensure due diligence of \\nfull AR Operations have been reviewed to the extreme intent of collections \\n(PDL involvement, reconciliations, full account review, cash applications \\ncompleted, etc.). \\nThe escalation to the AR Director will need the following elements included in \\nthe email – checklist below…'),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/document.pdf', 'page': 1}, page_content='STANDARD OPERATING PROCEDURE Accounts Receivable Department - \\nCollections\\nSOP No: 0000001 \\nSOP Title: Minimizing unnecessary \\ncommunication (Emails) \\nAll Open Invoices All Open \\nCredits/Payme\\nntsAny PPM Credits \\n(Payments)Date of Full \\nReconciliation \\nIdentified\\nEvaluation Warning: The document was created with Spire.Doc for Python.')]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n", + "docs = load_document(file_path)\n", + "docs" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "sop = SopGenerator()\n", + "roles = sop.get_roles(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'roles': ['AR Director', 'AR Supervisor', 'AR Analyst']}" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roles" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'roles': ['AR Director', 'AR Supervisor', 'AR Analyst']}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roles\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['AR Director', 'AR Supervisor', 'AR Analyst']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roles = roles[\"roles\"]\n", + "roles" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "sop_status = sop.check_role_sop(role=roles,docs=docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'message': \"SOPs found for the role: ['AR Director', 'AR Supervisor', 'AR Analyst']\",\n", + " 'status': True}" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sop_status" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "sops = sop.generate_sops(roles,docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AR Director': SOPs(must=[], shall=[], will=[]),\n", + " 'AR Supervisor': SOPs(must=[], shall=[], will=[]),\n", + " 'AR Analyst': SOPs(must=[], shall=[], will=[])}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sops.roles_sops" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "pip install pypandoc\n" + "file_path = " ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "OSError", - "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 54\u001b[0m\n\u001b[1;32m 51\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to .docx internally)\u001b[39;00m\n\u001b[0;32m---> 54\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# Load a .docx file\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 60\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n", - "Cell \u001b[0;32mIn[1], line 41\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path, use_unstructured)\u001b[0m\n\u001b[1;32m 38\u001b[0m loader \u001b[38;5;241m=\u001b[39m Docx2txtLoader(file_path)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# Convert .doc to .docx first\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_doc_to_docx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m load_document(docx_path, use_unstructured\u001b[38;5;241m=\u001b[39muse_unstructured)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "Cell \u001b[0;32mIn[1], line 16\u001b[0m, in \u001b[0;36mconvert_doc_to_docx\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc file to .docx using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted .docx file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdocx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdocx_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docx_path\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc." - ] - } - ], - "source": [ - "import os\n", - "import pypandoc\n", - "from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader\n", - "\n", - "def convert_doc_to_docx(doc_path: str) -> str:\n", - " \"\"\"\n", - " Convert a .doc file to .docx using pypandoc.\n", - " \n", - " Args:\n", - " doc_path (str): The path to the .doc file.\n", - "\n", - " Returns:\n", - " str: The path to the converted .docx file.\n", - " \"\"\"\n", - " docx_path = os.path.splitext(doc_path)[0] + '.docx'\n", - " pypandoc.convert_file(doc_path, 'docx', outputfile=docx_path)\n", - " return docx_path\n", - "\n", - "def load_document(file_path: str, use_unstructured: bool = False):\n", - " \"\"\"\n", - " Utility function to load a PDF, DOCX, or DOC file and convert it to document objects.\n", - "\n", - " Args:\n", - " file_path (str): The path to the file to load.\n", - " use_unstructured (bool): Whether to use the Unstructured loader for .docx files. Defaults to False.\n", - "\n", - " Returns:\n", - " List[Document]: A list of Document objects representing the contents of the file.\n", - " \"\"\"\n", - " extension = os.path.splitext(file_path)[1].lower()\n", - " \n", - " if extension == '.pdf':\n", - " loader = PyPDFLoader(file_path)\n", - " elif extension == '.docx':\n", - " if use_unstructured:\n", - " loader = UnstructuredWordDocumentLoader(file_path)\n", - " else:\n", - " loader = Docx2txtLoader(file_path)\n", - " elif extension == '.doc':\n", - " # Convert .doc to .docx first\n", - " docx_path = convert_doc_to_docx(file_path)\n", - " return load_document(docx_path, use_unstructured=use_unstructured)\n", - " else:\n", - " raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n", - " \n", - " return loader.load()\n", - "\n", - "# Example usage:\n", - "doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n", - "#docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n", - "pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n", - "\n", - "# Load a .doc file (converts to .docx internally)\n", - "doc_docs = load_document(doc_file_path)\n", - "\n", - "# Load a .docx file\n", - "#docx_docs = load_document(docx_file_path)\n", - "\n", - "# Load a PDF file\n", - "pdf_docs = load_document(pdf_file_path)\n" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", @@ -130,85 +214,50 @@ "metadata": {}, "outputs": [ { - "ename": "OSError", - "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 48\u001b[0m\n\u001b[1;32m 45\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to PDF first)\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;66;03m# Load a .docx file (converts to PDF first)\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m 54\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n", - "Cell \u001b[0;32mIn[4], line 33\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 29\u001b[0m extension \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(file_path)[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extension \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Convert .doc or .docx to PDF first\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_to_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m loader \u001b[38;5;241m=\u001b[39m PyPDFLoader(pdf_path)\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m:\n", - "Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mconvert_to_pdf\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mConvert a .doc or .docx file to PDF using pypandoc.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m str: The path to the converted PDF file.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pdf_path\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 198\u001b[0m discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m 361\u001b[0m _check_log_handler()\n\u001b[1;32m 363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m 367\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m 789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 790\u001b[0m \u001b[38;5;124m See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;124m for installation options\u001b[39m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m 793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 794\u001b[0m \u001b[38;5;124m ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m 795\u001b[0m \n\u001b[1;32m 796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc." - ] + "data": { + "text/plain": [ + "[Document(metadata={'source': '/root/ds_erp_ai/data/raw/document.pdf', 'page': 0}, page_content='STANDARD OPERATING PROCEDURE Accounts Receivable Department - \\nCollections\\nSOP No: 0000001 \\nSOP Title: Minimizing unnecessary \\ncommunication (Emails) \\nEvaluation Warning: The document was created with Spire.Doc for Python.\\nSOP \\nNumberSOP 0000001\\nSOP Title Minimizing unnecessary communication (Emails)\\nNAME TITLE SIGNATUR\\nEDATE\\nAuthor Angela Lewis AR Director\\n 03/12/24\\nReviewer Jamie Vega \\nJessica Merzougui\\nDeniece SantosAR Supervisor\\nAR Supervisor\\nAR Analyst\\nAudience Full AR Department\\nEffective Date: 03/12/24\\nPurpose: To minimize unnecessary email communication regarding carbon \\ncopies\\nSOP Statement: \\nOnly cc your immediate supervisor on any regular follow-ups per account by \\nAR Collector. If additional escalations are needed, the AR Supervisor must \\nprovide that information to the AR Director. This is to ensure due diligence of \\nfull AR Operations have been reviewed to the extreme intent of collections \\n(PDL involvement, reconciliations, full account review, cash applications \\ncompleted, etc.). \\nThe escalation to the AR Director will need the following elements included in \\nthe email – checklist below…'),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/document.pdf', 'page': 1}, page_content='STANDARD OPERATING PROCEDURE Accounts Receivable Department - \\nCollections\\nSOP No: 0000001 \\nSOP Title: Minimizing unnecessary \\ncommunication (Emails) \\nAll Open Invoices All Open \\nCredits/Payme\\nntsAny PPM Credits \\n(Payments)Date of Full \\nReconciliation \\nIdentified\\nEvaluation Warning: The document was created with Spire.Doc for Python.')]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import os\n", - "from spire.doc import Document, FileFormat\n", - "from langchain_community.document_loaders import PyPDFLoader\n", - "\n", - "def convert_word_to_pdf(doc_path: str) -> str:\n", - " \"\"\"\n", - " Convert a .doc or .docx file to PDF using Spire.Doc.\n", - " \n", - " Args:\n", - " doc_path (str): The path to the .doc or .docx file.\n", - "\n", - " Returns:\n", - " str: The path to the converted PDF file.\n", - " \"\"\"\n", - " pdf_path = os.path.splitext(doc_path)[0] + '.pdf'\n", - " \n", - " # Create a Document object\n", - " document = Document()\n", - " # Load the Word document\n", - " document.load_from_file(doc_path)\n", - " # Save as PDF\n", - " document.save_to_file(pdf_path, FileFormat.PDF)\n", - " document.close()\n", - " \n", - " return pdf_path\n", - "\n", - "def load_document(file_path: str):\n", - " \"\"\"\n", - " Utility function to load a PDF, DOCX, or DOC file by first converting it to PDF.\n", - "\n", - " Args:\n", - " file_path (str): The path to the file to load.\n", - "\n", - " Returns:\n", - " List[Document]: A list of Document objects representing the contents of the file.\n", - " \"\"\"\n", - " extension = os.path.splitext(file_path)[1].lower()\n", - " \n", - " if extension in ['.doc', '.docx']:\n", - " # Convert .doc or .docx to PDF first\n", - " pdf_path = convert_word_to_pdf(file_path)\n", - " loader = PyPDFLoader(pdf_path)\n", - " elif extension == '.pdf':\n", - " loader = PyPDFLoader(file_path)\n", - " else:\n", - " raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n", - " \n", - " return loader.load()\n", - "\n", - "# Example usage:\n", - "doc_file_path = \"/root/ds_erp_ai/data/raw/document.doc\"\n", - "docx_file_path = \"/root/ds_erp_ai/data/raw/test_docx.docx\"\n", - "pdf_file_path = \"/root/ds_erp_ai/data/raw/test_sop.pdf\"\n", - "\n", - "# Load a .doc file (converts to PDF first)\n", - "doc_docs = load_document(doc_file_path)\n", - "\n", - "# Load a .docx file (converts to PDF first)\n", - "docx_docs = load_document(docx_file_path)\n", - "\n", - "# Load a PDF file\n", - "pdf_docs = load_document(pdf_file_path)\n" + "doc_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 0}, page_content='ERGO \\nERGO 202 4 VISION \\nDEFINITION \\nA standard operating procedure (SOP) is a step -by-step, repeatable process for any critical \\nand routine task. It’s a kind of documentation that prevents stress, mistakes, and \\nmiscommunication. SOPs ensure reliability, efficiency, and consistently hitting quality \\nstandards in regular work activities. The SOPs represent the what, when, where and how of \\nevery role in the organization as it aligns to it’s vision and goals. \\n \\nVISION: \\nERGO envisions itself as the leading platform for renting spaces, equipment, and services, \\ntransforming the way people access and share resources. By the end of the next year, we \\naim to significantly increase our user base and participation rates, fosterin g a vibrant and \\nactive community that thrives on mutual benefit and shared success. \\n \\nWe are committed to creating an intuitive, reliable, and user -friendly app that empowers \\nindividuals and businesses to rent what they need when they need it, contributing to a \\nmore efficient and sustainable use of resources. We believe in the power of tech nology to \\nsimplify transactions, build connections, and unlock new possibilities. \\n \\nOur strategic vision is to continuously innovate and improve our platform, ensuring it \\nmeets the evolving needs of our users. We will invest in marketing and user engagement \\nstrategies to attract new users and retain existing ones, with a focus on providin g \\nexceptional customer service and a seamless user experience. \\n \\nIn the long term, we aspire to expand our services to new markets and regions, establishing \\nERGO as a global leader in the sharing economy. We are dedicated to creating value for our \\nusers, stakeholders, and the broader community, contributing to a more co nnected and \\nresource -efficient world.\" \\n \\nRemember, a strategic vision is a guiding light for a company’s future direction. It will be \\nrevisited and potentially revised as the company grows and the market landscape \\nchanges. It’s also important to ensure that all stakeholders understand and align with the \\nvision. This includes employees, investors, and customers. They all play a crucia l role in \\nthe company’s journey towards its vision. \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 1}, page_content=' '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 2}, page_content='GOALS; \\n1. MARKETING \\na. Brand Awareness : Increase brand awareness by 50% over the next year \\nthrough targeted marketing campaigns. \\nb. User Acquisition: Acquire 100,000 new users over the next year through \\ndigital marketing strategies. \\nc. User Engagement : Improve user engagement by 30% by implementing a \\ncontent marketing strategy that educates users about the benefits and uses \\nof ERGO. \\nd. Market Expansion : Enter two new geographic markets by the end of the year \\nthrough localized marketing campaigns. \\n2. IT INFRASTRUCTURE : \\na. Platform Stability: Achieve 99.9% uptime for the ERGO app to ensure a \\nseamless user experience. \\nb. Security: Implement robust security measures to protect user data and build \\ntrust with the user base. \\nc. Scalability: Enhance the IT infrastructure to support a 50% increase in user \\ntraffic over the next year. \\nd. Feature Development: Develop and launch three new features based on user \\nfeedback to improve the functionality and usability of the ERGO app. \\n3. SALES TEAM GOALS: \\na. New User Sales: Sign up 50,000 new users over the next year through direct \\nsales efforts. \\nb. Partnership Development: Establish partnerships with 20 new businesses to \\noffer their spaces, equipment, or services on ERGO. \\nc. Customer Retention: Achieve a customer retention rate of 90% by providing \\nexcellent customer service and follow -up. \\nd. Revenue Growth: Increase sales revenue by 30% over the next year by \\nupselling premium features and services. \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 3}, page_content='ERGO 2024 Standard Operating Procedures : \\n \\nAPPLICATION \\nThis document applies to Information technology specialists and contractors. Marketing \\nteams and Contracting Specialist inv olved with Marketing contracts. Current and future \\nsales associates employed or contracted by ERGO. \\n \\nVISION STATEMENT \\n \\nERGO’s strategic goals for the next year encompass marketing, IT infrastructure, and sales. \\nIn marketing, we aim to increase brand awareness by 50% through targeted campaigns, \\nacquire 100,000 new users via digital strategies, improve user engagement by 30% with a \\ncontent marketing strategy, and expand into two new markets. For IT infrastructure, our \\ngoals are to achieve 99.9% app uptime, implement robust security measures, enhance \\ninfrastructure to support a 50% increase in user traffic, and develop three n ew features \\nbased on user feedback. The sales team aims to sign up 50,000 new users, establish \\npartnerships with 20 new businesses, achieve a 90% customer retention rate, and increase \\nsales revenue by 30% through upselling. \\n \\nRESPONSIBILITIES \\n \\nCHIEF OPERATIONS OFFICER : \\nCross -Functional Coordination: As a key member of the executive team, the COO will \\nensure alignment and coordination across different departments to achieve these goals. \\nPerformance Monitoring: The COO will establish key performance indicators (KPIs) and \\nregularly review progress towards these goals, making necessary adjustments to strategies \\nand plans. \\n \\nSALES MANAGER; \\nUnder the oversight and control of the COO, the sales manager will guide the sales team to \\nsign up 50,000 new users, establish partnerships with 20 new businesses, achieve a 90% \\ncustomer retention rate, and increase sales revenue by 30% through upselling. \\n \\nCAMPAIGN MANAGER: '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 4}, page_content='Under the oversight and control of the COO the campaign manager will oversee the \\nmarketing team’s efforts to increase brand awareness and acquire new users. They will \\nalso ensure the implementation of a content marketing strategy to improve user \\nengagement and spearhead efforts to expand into two new geographic markets. The \\ncampaign manager will exercise operational oversight and control over marketing \\nspecialist and ad contracts. \\n \\nCONTENT MARKETING SPECIALIST: \\nUnder the oversight and control of the campaign manager the content marketing specialist \\nwill; \\n1. Create fun and informative quizzes related to your brand or services . Share them on \\nsocial media and encourage users to participate and share their results. \\n2. Share behind -the-scenes glimpses of your company culture, product development, \\nor team members. People love to connect with the human side of brands. \\n3. User -Generated Content Contests: Run a contest where users submit photos, \\nvideos, or stories related to your brand. Feature the best entries on your website or \\nsocial media. \\n4. Host online webinars or workshops that provide value to potential users. Topics \\ncould include product demos, services trends, or problem -solving sessions. \\n5. Create comprehensive guides or e -books that address common pain points your \\nproduct solves. Promote these through targeted ads and email campaigns. \\n6. Share success stories of existing users. Highlight how your product transformed \\ntheir lives or businesses. Authentic testimonials resonate well with potential users. \\n7. Develop a series of short videos explaining different aspects of your product or \\nservices . Keep them engaging and easy to understand. \\n8. Create visually appealing infographics that educate users about ERGO’s benefits. \\nUse interactive elements like clickable sections or animations. \\n9. Set up a community forum where users can ask questions, share tips, and interact \\nwith each other. Host live Q&A sessions on social media. \\n10. Write blog posts specifically tailored to the culture, language, and interests of the \\nnew markets. Address local challenges and showcase how ERGO can solve them. \\n11. Create landing pages in the local language, highlighting the unique value \\nproposition of ERGO for each market. \\n12. Partner with influencers or thought leaders from the new markets. Have them \\ncreate content or host events related to ERGO. \\n \\nDIGITAL MARKETING SPECIALIST: '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 5}, page_content='Under the oversight and control of the campaign manager the digital marketing specialist \\nwill; \\n1. Conduct market research to understand the target audience, their preferences, and \\nwhere they spend their time online.. \\n2. React to media inquiries: engage with journalists and media outlets to kickstart your \\nPR efforts1. \\n3. Start a referral program to encourage existing users to refer new ones. \\n4. Regularly publish informative blog posts to attract organic traffic and showcase \\nyour expertise . \\n5. Offer limited -time promotions or exclusive deals to entice new users. \\n6. Personalize content: tailor content to user interests and needs. Use personalized \\npush notifications, emails, and SEO strategies. \\n7. Measure metrics: track page views, time spent on site, bounce rate, and social \\nmedia interactions. Tools like Full Session can provide insights into user behavior4. \\n8. Engage with audience: respond to comments, likes, and shares. Regularly publish \\nrelevant content to keep users engaged. \\n9. Conduct market research: understand local nuances, preferences, and behaviors. \\nDefine demographics, languages, and dialects. \\n10. Adapt content: translate and culturally adapt content. Consider local holidays, \\nsymbols, and traditions. \\n11. Localize marketing channels: choose platforms popular in the new markets. Adjust \\nadvertising campaigns accordingly. \\n12. Test and analyze pilot localized campaigns, analyze results, and optimize based on \\ndata. \\nUnder the oversight and control of the campaign manager and in coordination with other \\nspecialist the digital marketing specialist must; \\n1. Leverage social media: use platforms where your audience is most active. Share \\nengaging content, run targeted ads, and interact with followers \\n2. Implement a search -focused content marketing strategy, create valuable content \\nthat aligns with what your audience searches for. Optimize it for relevant keywords \\nto improve organic visibility \\n3. Run targeted ads on cloud -related platforms or industry -specific websites in the \\nnew markets. Emphasize ERGO’s cloud advantages for local businesses. \\nSEARCH ENGINE OPTIMIZATION SPECIALIST: \\nUnder the oversight and control of the campaign manager the search engine optimization \\nspecialist will; \\n1. Create valuable content that aligns with what your audience searches for. Optimize \\nit for relevant keywords to improve organic visibility '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 6}, page_content='2. Personalize Content: Tailor content to user interests and needs. Use personalized \\npush notifications, emails, and SEO strategies. \\n3. Measure Metrics: Track page views, time spent on site, bounce rate, and social \\nmedia interactions. Tools like Full Session can provide insights into user behavior4. \\n4. Under the oversight and control of the campaign manager and in coordination with \\nother specialist the search engine optimization specialist must; \\n5. Engage with journalists and media outlets to kickstart your PR efforts. \\n6. Use platforms where your audience is most active. Share engaging content, run \\ntargeted ads, and interact with followers. \\n \\nINFORMATION TECHNOLOGY OFFICER \\nIn coordination cooperation with the Chief Operations Officer the Information Technology \\nOfficer will work closely with the it team to achieve 99.9% uptime for the ergo app, \\nimplement robust security measures, shall enhance the it infrastructure to support \\nincreased user traffic, and must develop and launch new features based on user feedback. \\nThe Information Technology officer will h ave a well -defined plan in place to handle security \\nincidents in local and new markets . \\nCLOUD ENGINEER: \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nshall; \\n1. Select cloud service providers with strong security practices. Ensure they comply \\nwith industry standards. \\n2. Clearly communicate to users how their data will be collected, used, and \\nprotected. Maintain a comprehensive privacy policy on your website. \\n3. Obtain explicit consent from users before collecting any personal data. Implement \\ncookie consent banners and ensure compliance with regulations like GDPR or \\nCCPA. \\n4. Isolate different components of your infrastructure to prevent lateral movement in \\ncase of a breach \\n5. Conduct periodic security audits to identify vulnerabilities and address them \\npromptly. \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nwill; \\n1. Design , implement, and manag e cloud computing solutions, such as public, \\nprivate, and hybrid cloud environments. \\n2. Create step -by-step video tutorials on how new users can set up their accounts, \\naccess features, and utilize cloud -based services within ERGO. \\n3. Deploy cloud infrastructure '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 7}, page_content=\"4. Migrate on -premises systems to the cloud \\n1. Continuously monitor and optimize ERGO’s cloud resources for scalability and \\ncost -effectiveness. Share success stories of how cloud improvements directly \\nbenefit users. Ensure cloud security and compliance \\n2. Host webinars on cloud -related topics, emphasizing how ERGO leverages cloud \\ntechnology. Invite industry experts and influencers to participate. \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nmust ; \\n3. Establish an online community forum where users can share tips, ask questions, \\nand discuss cloud -related topics. Cloud engineers can actively participate and \\nprovide insights. \\n4. Create visually appealing case studies or blog posts highlighting how ERGO’s cloud \\ninfrastructure enables seamless user experiences. Explain how scalability, \\nreliability, and security are achieved through cloud services. \\n5. Develop an interactive web page or app feature that takes users on a virtual tour of \\nERGO’s cloud architecture. Explain key components, data flow, and benefits. \\n6. Train cloud support teams to assist users in the new markets. Ensure they \\nunderstand the cultural context and can address cloud -related queries effectively. \\n \\nDATABASE ADMINISTRATOR (DBA): \\nDatabase administrators are responsible for the design, implementation, and \\nmaintenance of databases that store and organize an organization's data. They install and \\nconfigure database management systems, optimize database performance, backup and \\nrestore d ata, and enforce data security policies. \\nunder the oversight and control of the Information Technology Officer the DBA will; \\n1. Continuously update and expand the user database. Capture user data through \\nsign -ups, app usage, and interactions. Ensure data accuracy and completeness. \\n2. Leverage lookalike audience targeting on platforms like Facebook and Google Ads. \\nUse existing user profiles to find similar potential users. \\n3. Set up automated triggers based on user behavior (e.g., abandoned carts, frequent \\nvisits). Send personalized emails or notifications to re -engage users. \\n4. Regularly clean and optimize the database. Remove inactive or irrelevant users to \\nimprove engagement metrics. \\n5. Gather data on market trends, cultural nuances, and user preferences in the new \\nmarkets. Understand how ERGO’s brand can align with local values. \\nunder the oversight and control of the Information Technology Officer the DBA must; \"),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 8}, page_content='1. Collaborate with the marketing team to collect data on brand awareness metrics \\n(e.g., social media mentions, website traffic, search volume). Use this data to \\nidentify trends and areas for improvement. \\n2. Create segments within the database based on user interactions with ERGO’s \\nbrand. Tailor marketing messages to each segment, emphasizing brand values, \\nunique selling points, and success stories. \\n3. Experiment with different marketing campaigns and track their impact on brand \\nawareness. Optimize based on the most effective strategies. \\n6. Work closely with data analysts to profile existing users. Understand their \\ndemographics, behavior, and preferences. Use this information to target similar \\naudiences in digital marketing campaigns. \\n7. Collaborate with content creators to personalize marketing content. Use database \\ninsights to tailor messages that resonate with users’ interests and pain points. \\n8. Collaborate with content teams to create region -specific marketing materials. \\nTranslate content, adapt visuals, and customize messaging. \\nIT SECURITY ANALYST: \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst shall; \\n1. Research and understand the data protection laws and regulations in the new \\nmarkets. Ensure compliance with GDPR, CCPA, or other relevant standards. \\n2. Train local teams on incident response procedures \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst will; \\n1. Restrict access to marketing databases. Only authorized personnel should handle \\nsensitive customer information. \\n2. Implement CAPTCHA or other anti -bot measures during user registration to prevent \\nfraudulent accounts. \\n3. If ERGO uses apis for user registration or authentication, ensure they are properly \\nsecured with authentication tokens and rate limiting. \\n4. Set up anomaly detection to identify suspicious user activity (e.g., sudden spikes in \\nregistrations). \\n5. Regularly update CMS platforms and plugins to patch security vulnerabilities. Use \\nstrong authentication for CMS access. \\n6. Ensure that content marketing materials (blogs, videos, etc.) Are free from \\nmalicious code (e.g., cross -site scripting). Scan for vulnerabilities. \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst must; '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 9}, page_content=\"1. Collaborate with the marketing team to ensure that all customer data collected for \\ntargeted campaigns is securely stored and transmitted. Implement encryption for \\ndata in transit and at rest. \\n2. Educate marketing staff about security best practices. Ensure they understand the \\nimportance of protecting customer data. \\nIT OPERATIONS MANAGER: \\nIT operations managers oversee the day -to-day operations of an organization's IT \\ninfrastructure and support teams. They develop and implement IT policies and procedures, \\nmanage IT resources and budgets, coordinate IT projects and initiatives, and ensure th at IT \\nsystems meet business requirements and objectives. \\nIT SUPPORT SPECIALIST: \\nUnder the oversight and control of the IT OPERATIONS MANAGER the IT Security Analyst \\nwill; \\n1. Ensure that the onboarding process is seamless for new users. Provide clear \\ninstructions, troubleshoot any issues, and guide them through setup. \\n2. Set up efficient user support channels (chat, email, or phone). Respond promptly to \\ninquiries and resolve any technical hurdles. \\n3. Reach out to users who haven’t engaged recently. Offer personalized assistance, \\nask for feedback, and provide tips on using ERGO effectively. \\n4. Gather user feedback on their experiences. Use this information to improve the \\nproduct and enhance brand perception. \\n5. Conduct surveys to understand pain points and areas for improvement. Use this \\ndata to enhance user engagement strategies. \\n6. Continuously update the knowledge base with troubleshooting guides, FAQs, and \\nbest practices. Make it easily accessible to users. \\n7. Collaborate with localization teams to ensure that ERGO’s software and \\ndocumentation are available in the local languages of the new markets. \\n8. Anticipate region -specific technical challenges. Prepare troubleshooting guides \\ntailored to the new markets. \\n9. Host webinars or virtual training sessions for existing and potential users. Teach \\nthem how to make the most of ERGO’s services. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the IT Security Analyst \\nmust; \\n1. Collaborate with the marketing team to create engaging technical content. Write \\nblog posts, FAQs, or video tutorials about ERGO’s unique features and benefits. \\n2. Encourage existing users to refer new users. Offer incentives or discounts for \\nsuccessful referrals. \"),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 10}, page_content='3. Understand cultural nuances and adapt communication accordingly. Provide user \\nsupport that aligns with local customs and preferences. \\nDEVOPS ENGINEER: \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nshall ; \\n1. Collaborate with legal and compliance teams to understand data protection laws \\nand ensure adherence in each region. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nwill; \\n1. Implement monitoring tools to track website performance, uptime, and user \\ninteractions. Set up alerts for any anomalies or downtime. \\n2. Ensure that ERGO’s infrastructure can handle increased traffic during marketing \\ncampaigns. Autoscale servers and optimize load balancers. \\n3. Design robust and fault -tolerant systems to handle user registrations, logins, and \\ndata processing. Use redundancy and failover mechanisms. \\n4. optimize content delivery networks (CDNs). Reduce latency for users accessing \\nERGO’s content. \\n5. Set up infrastructure to support A/B testing for different content variations. Monitor \\nuser engagement metrics for each variant. \\n6. Customize deployment scripts for each new market. Consider regional cloud \\nproviders and data centers. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nmust; \\n1. Set up CI/CD pipelines to streamline marketing campaign deployments. Automate \\nthe rollout of new landing pages, banners, and promotional content. \\n2. Collaborate with DBA to simulate heavy user loads during testing. Optimize \\ndatabase queries and API endpoints for scalability. \\n3. Create a feedback loop between DevOps and content teams. Rapidly deploy \\nchanges based on user feedback to improve engagement. \\n4. Set up databases with geo -replication to ensure data availability and low latency in \\nnew markets. \\n \\n \\n \\n \\n \\n \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 11}, page_content=' \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdf_docs" ] } ], diff --git a/notebooks/test_sop_generator.ipynb b/notebooks/test_sop_generator.ipynb index 6e195ea..59ac554 100644 --- a/notebooks/test_sop_generator.ipynb +++ b/notebooks/test_sop_generator.ipynb @@ -13,31 +13,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], + "source": [ + "from langchain_community.document_loaders import PyPDFLoader\n", + "loader = PyPDFLoader(\"/root/ds_erp_ai/data/raw/test_sop.pdf\")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 0}, page_content='ERGO \\nERGO 202 4 VISION \\nDEFINITION \\nA standard operating procedure (SOP) is a step -by-step, repeatable process for any critical \\nand routine task. It’s a kind of documentation that prevents stress, mistakes, and \\nmiscommunication. SOPs ensure reliability, efficiency, and consistently hitting quality \\nstandards in regular work activities. The SOPs represent the what, when, where and how of \\nevery role in the organization as it aligns to it’s vision and goals. \\n \\nVISION: \\nERGO envisions itself as the leading platform for renting spaces, equipment, and services, \\ntransforming the way people access and share resources. By the end of the next year, we \\naim to significantly increase our user base and participation rates, fosterin g a vibrant and \\nactive community that thrives on mutual benefit and shared success. \\n \\nWe are committed to creating an intuitive, reliable, and user -friendly app that empowers \\nindividuals and businesses to rent what they need when they need it, contributing to a \\nmore efficient and sustainable use of resources. We believe in the power of tech nology to \\nsimplify transactions, build connections, and unlock new possibilities. \\n \\nOur strategic vision is to continuously innovate and improve our platform, ensuring it \\nmeets the evolving needs of our users. We will invest in marketing and user engagement \\nstrategies to attract new users and retain existing ones, with a focus on providin g \\nexceptional customer service and a seamless user experience. \\n \\nIn the long term, we aspire to expand our services to new markets and regions, establishing \\nERGO as a global leader in the sharing economy. We are dedicated to creating value for our \\nusers, stakeholders, and the broader community, contributing to a more co nnected and \\nresource -efficient world.\" \\n \\nRemember, a strategic vision is a guiding light for a company’s future direction. It will be \\nrevisited and potentially revised as the company grows and the market landscape \\nchanges. It’s also important to ensure that all stakeholders understand and align with the \\nvision. This includes employees, investors, and customers. They all play a crucia l role in \\nthe company’s journey towards its vision. \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 1}, page_content=' '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 2}, page_content='GOALS; \\n1. MARKETING \\na. Brand Awareness : Increase brand awareness by 50% over the next year \\nthrough targeted marketing campaigns. \\nb. User Acquisition: Acquire 100,000 new users over the next year through \\ndigital marketing strategies. \\nc. User Engagement : Improve user engagement by 30% by implementing a \\ncontent marketing strategy that educates users about the benefits and uses \\nof ERGO. \\nd. Market Expansion : Enter two new geographic markets by the end of the year \\nthrough localized marketing campaigns. \\n2. IT INFRASTRUCTURE : \\na. Platform Stability: Achieve 99.9% uptime for the ERGO app to ensure a \\nseamless user experience. \\nb. Security: Implement robust security measures to protect user data and build \\ntrust with the user base. \\nc. Scalability: Enhance the IT infrastructure to support a 50% increase in user \\ntraffic over the next year. \\nd. Feature Development: Develop and launch three new features based on user \\nfeedback to improve the functionality and usability of the ERGO app. \\n3. SALES TEAM GOALS: \\na. New User Sales: Sign up 50,000 new users over the next year through direct \\nsales efforts. \\nb. Partnership Development: Establish partnerships with 20 new businesses to \\noffer their spaces, equipment, or services on ERGO. \\nc. Customer Retention: Achieve a customer retention rate of 90% by providing \\nexcellent customer service and follow -up. \\nd. Revenue Growth: Increase sales revenue by 30% over the next year by \\nupselling premium features and services. \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 3}, page_content='ERGO 2024 Standard Operating Procedures : \\n \\nAPPLICATION \\nThis document applies to Information technology specialists and contractors. Marketing \\nteams and Contracting Specialist inv olved with Marketing contracts. Current and future \\nsales associates employed or contracted by ERGO. \\n \\nVISION STATEMENT \\n \\nERGO’s strategic goals for the next year encompass marketing, IT infrastructure, and sales. \\nIn marketing, we aim to increase brand awareness by 50% through targeted campaigns, \\nacquire 100,000 new users via digital strategies, improve user engagement by 30% with a \\ncontent marketing strategy, and expand into two new markets. For IT infrastructure, our \\ngoals are to achieve 99.9% app uptime, implement robust security measures, enhance \\ninfrastructure to support a 50% increase in user traffic, and develop three n ew features \\nbased on user feedback. The sales team aims to sign up 50,000 new users, establish \\npartnerships with 20 new businesses, achieve a 90% customer retention rate, and increase \\nsales revenue by 30% through upselling. \\n \\nRESPONSIBILITIES \\n \\nCHIEF OPERATIONS OFFICER : \\nCross -Functional Coordination: As a key member of the executive team, the COO will \\nensure alignment and coordination across different departments to achieve these goals. \\nPerformance Monitoring: The COO will establish key performance indicators (KPIs) and \\nregularly review progress towards these goals, making necessary adjustments to strategies \\nand plans. \\n \\nSALES MANAGER; \\nUnder the oversight and control of the COO, the sales manager will guide the sales team to \\nsign up 50,000 new users, establish partnerships with 20 new businesses, achieve a 90% \\ncustomer retention rate, and increase sales revenue by 30% through upselling. \\n \\nCAMPAIGN MANAGER: '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 4}, page_content='Under the oversight and control of the COO the campaign manager will oversee the \\nmarketing team’s efforts to increase brand awareness and acquire new users. They will \\nalso ensure the implementation of a content marketing strategy to improve user \\nengagement and spearhead efforts to expand into two new geographic markets. The \\ncampaign manager will exercise operational oversight and control over marketing \\nspecialist and ad contracts. \\n \\nCONTENT MARKETING SPECIALIST: \\nUnder the oversight and control of the campaign manager the content marketing specialist \\nwill; \\n1. Create fun and informative quizzes related to your brand or services . Share them on \\nsocial media and encourage users to participate and share their results. \\n2. Share behind -the-scenes glimpses of your company culture, product development, \\nor team members. People love to connect with the human side of brands. \\n3. User -Generated Content Contests: Run a contest where users submit photos, \\nvideos, or stories related to your brand. Feature the best entries on your website or \\nsocial media. \\n4. Host online webinars or workshops that provide value to potential users. Topics \\ncould include product demos, services trends, or problem -solving sessions. \\n5. Create comprehensive guides or e -books that address common pain points your \\nproduct solves. Promote these through targeted ads and email campaigns. \\n6. Share success stories of existing users. Highlight how your product transformed \\ntheir lives or businesses. Authentic testimonials resonate well with potential users. \\n7. Develop a series of short videos explaining different aspects of your product or \\nservices . Keep them engaging and easy to understand. \\n8. Create visually appealing infographics that educate users about ERGO’s benefits. \\nUse interactive elements like clickable sections or animations. \\n9. Set up a community forum where users can ask questions, share tips, and interact \\nwith each other. Host live Q&A sessions on social media. \\n10. Write blog posts specifically tailored to the culture, language, and interests of the \\nnew markets. Address local challenges and showcase how ERGO can solve them. \\n11. Create landing pages in the local language, highlighting the unique value \\nproposition of ERGO for each market. \\n12. Partner with influencers or thought leaders from the new markets. Have them \\ncreate content or host events related to ERGO. \\n \\nDIGITAL MARKETING SPECIALIST: '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 5}, page_content='Under the oversight and control of the campaign manager the digital marketing specialist \\nwill; \\n1. Conduct market research to understand the target audience, their preferences, and \\nwhere they spend their time online.. \\n2. React to media inquiries: engage with journalists and media outlets to kickstart your \\nPR efforts1. \\n3. Start a referral program to encourage existing users to refer new ones. \\n4. Regularly publish informative blog posts to attract organic traffic and showcase \\nyour expertise . \\n5. Offer limited -time promotions or exclusive deals to entice new users. \\n6. Personalize content: tailor content to user interests and needs. Use personalized \\npush notifications, emails, and SEO strategies. \\n7. Measure metrics: track page views, time spent on site, bounce rate, and social \\nmedia interactions. Tools like Full Session can provide insights into user behavior4. \\n8. Engage with audience: respond to comments, likes, and shares. Regularly publish \\nrelevant content to keep users engaged. \\n9. Conduct market research: understand local nuances, preferences, and behaviors. \\nDefine demographics, languages, and dialects. \\n10. Adapt content: translate and culturally adapt content. Consider local holidays, \\nsymbols, and traditions. \\n11. Localize marketing channels: choose platforms popular in the new markets. Adjust \\nadvertising campaigns accordingly. \\n12. Test and analyze pilot localized campaigns, analyze results, and optimize based on \\ndata. \\nUnder the oversight and control of the campaign manager and in coordination with other \\nspecialist the digital marketing specialist must; \\n1. Leverage social media: use platforms where your audience is most active. Share \\nengaging content, run targeted ads, and interact with followers \\n2. Implement a search -focused content marketing strategy, create valuable content \\nthat aligns with what your audience searches for. Optimize it for relevant keywords \\nto improve organic visibility \\n3. Run targeted ads on cloud -related platforms or industry -specific websites in the \\nnew markets. Emphasize ERGO’s cloud advantages for local businesses. \\nSEARCH ENGINE OPTIMIZATION SPECIALIST: \\nUnder the oversight and control of the campaign manager the search engine optimization \\nspecialist will; \\n1. Create valuable content that aligns with what your audience searches for. Optimize \\nit for relevant keywords to improve organic visibility '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 6}, page_content='2. Personalize Content: Tailor content to user interests and needs. Use personalized \\npush notifications, emails, and SEO strategies. \\n3. Measure Metrics: Track page views, time spent on site, bounce rate, and social \\nmedia interactions. Tools like Full Session can provide insights into user behavior4. \\n4. Under the oversight and control of the campaign manager and in coordination with \\nother specialist the search engine optimization specialist must; \\n5. Engage with journalists and media outlets to kickstart your PR efforts. \\n6. Use platforms where your audience is most active. Share engaging content, run \\ntargeted ads, and interact with followers. \\n \\nINFORMATION TECHNOLOGY OFFICER \\nIn coordination cooperation with the Chief Operations Officer the Information Technology \\nOfficer will work closely with the it team to achieve 99.9% uptime for the ergo app, \\nimplement robust security measures, shall enhance the it infrastructure to support \\nincreased user traffic, and must develop and launch new features based on user feedback. \\nThe Information Technology officer will h ave a well -defined plan in place to handle security \\nincidents in local and new markets . \\nCLOUD ENGINEER: \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nshall; \\n1. Select cloud service providers with strong security practices. Ensure they comply \\nwith industry standards. \\n2. Clearly communicate to users how their data will be collected, used, and \\nprotected. Maintain a comprehensive privacy policy on your website. \\n3. Obtain explicit consent from users before collecting any personal data. Implement \\ncookie consent banners and ensure compliance with regulations like GDPR or \\nCCPA. \\n4. Isolate different components of your infrastructure to prevent lateral movement in \\ncase of a breach \\n5. Conduct periodic security audits to identify vulnerabilities and address them \\npromptly. \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nwill; \\n1. Design , implement, and manag e cloud computing solutions, such as public, \\nprivate, and hybrid cloud environments. \\n2. Create step -by-step video tutorials on how new users can set up their accounts, \\naccess features, and utilize cloud -based services within ERGO. \\n3. Deploy cloud infrastructure '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 7}, page_content=\"4. Migrate on -premises systems to the cloud \\n1. Continuously monitor and optimize ERGO’s cloud resources for scalability and \\ncost -effectiveness. Share success stories of how cloud improvements directly \\nbenefit users. Ensure cloud security and compliance \\n2. Host webinars on cloud -related topics, emphasizing how ERGO leverages cloud \\ntechnology. Invite industry experts and influencers to participate. \\nunder the oversight and control of the Information Technology Officer the Cloud Engineer \\nmust ; \\n3. Establish an online community forum where users can share tips, ask questions, \\nand discuss cloud -related topics. Cloud engineers can actively participate and \\nprovide insights. \\n4. Create visually appealing case studies or blog posts highlighting how ERGO’s cloud \\ninfrastructure enables seamless user experiences. Explain how scalability, \\nreliability, and security are achieved through cloud services. \\n5. Develop an interactive web page or app feature that takes users on a virtual tour of \\nERGO’s cloud architecture. Explain key components, data flow, and benefits. \\n6. Train cloud support teams to assist users in the new markets. Ensure they \\nunderstand the cultural context and can address cloud -related queries effectively. \\n \\nDATABASE ADMINISTRATOR (DBA): \\nDatabase administrators are responsible for the design, implementation, and \\nmaintenance of databases that store and organize an organization's data. They install and \\nconfigure database management systems, optimize database performance, backup and \\nrestore d ata, and enforce data security policies. \\nunder the oversight and control of the Information Technology Officer the DBA will; \\n1. Continuously update and expand the user database. Capture user data through \\nsign -ups, app usage, and interactions. Ensure data accuracy and completeness. \\n2. Leverage lookalike audience targeting on platforms like Facebook and Google Ads. \\nUse existing user profiles to find similar potential users. \\n3. Set up automated triggers based on user behavior (e.g., abandoned carts, frequent \\nvisits). Send personalized emails or notifications to re -engage users. \\n4. Regularly clean and optimize the database. Remove inactive or irrelevant users to \\nimprove engagement metrics. \\n5. Gather data on market trends, cultural nuances, and user preferences in the new \\nmarkets. Understand how ERGO’s brand can align with local values. \\nunder the oversight and control of the Information Technology Officer the DBA must; \"),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 8}, page_content='1. Collaborate with the marketing team to collect data on brand awareness metrics \\n(e.g., social media mentions, website traffic, search volume). Use this data to \\nidentify trends and areas for improvement. \\n2. Create segments within the database based on user interactions with ERGO’s \\nbrand. Tailor marketing messages to each segment, emphasizing brand values, \\nunique selling points, and success stories. \\n3. Experiment with different marketing campaigns and track their impact on brand \\nawareness. Optimize based on the most effective strategies. \\n6. Work closely with data analysts to profile existing users. Understand their \\ndemographics, behavior, and preferences. Use this information to target similar \\naudiences in digital marketing campaigns. \\n7. Collaborate with content creators to personalize marketing content. Use database \\ninsights to tailor messages that resonate with users’ interests and pain points. \\n8. Collaborate with content teams to create region -specific marketing materials. \\nTranslate content, adapt visuals, and customize messaging. \\nIT SECURITY ANALYST: \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst shall; \\n1. Research and understand the data protection laws and regulations in the new \\nmarkets. Ensure compliance with GDPR, CCPA, or other relevant standards. \\n2. Train local teams on incident response procedures \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst will; \\n1. Restrict access to marketing databases. Only authorized personnel should handle \\nsensitive customer information. \\n2. Implement CAPTCHA or other anti -bot measures during user registration to prevent \\nfraudulent accounts. \\n3. If ERGO uses apis for user registration or authentication, ensure they are properly \\nsecured with authentication tokens and rate limiting. \\n4. Set up anomaly detection to identify suspicious user activity (e.g., sudden spikes in \\nregistrations). \\n5. Regularly update CMS platforms and plugins to patch security vulnerabilities. Use \\nstrong authentication for CMS access. \\n6. Ensure that content marketing materials (blogs, videos, etc.) Are free from \\nmalicious code (e.g., cross -site scripting). Scan for vulnerabilities. \\nUnder the oversight and control of the Information Technology Officer the IT Security \\nAnalyst must; '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 9}, page_content=\"1. Collaborate with the marketing team to ensure that all customer data collected for \\ntargeted campaigns is securely stored and transmitted. Implement encryption for \\ndata in transit and at rest. \\n2. Educate marketing staff about security best practices. Ensure they understand the \\nimportance of protecting customer data. \\nIT OPERATIONS MANAGER: \\nIT operations managers oversee the day -to-day operations of an organization's IT \\ninfrastructure and support teams. They develop and implement IT policies and procedures, \\nmanage IT resources and budgets, coordinate IT projects and initiatives, and ensure th at IT \\nsystems meet business requirements and objectives. \\nIT SUPPORT SPECIALIST: \\nUnder the oversight and control of the IT OPERATIONS MANAGER the IT Security Analyst \\nwill; \\n1. Ensure that the onboarding process is seamless for new users. Provide clear \\ninstructions, troubleshoot any issues, and guide them through setup. \\n2. Set up efficient user support channels (chat, email, or phone). Respond promptly to \\ninquiries and resolve any technical hurdles. \\n3. Reach out to users who haven’t engaged recently. Offer personalized assistance, \\nask for feedback, and provide tips on using ERGO effectively. \\n4. Gather user feedback on their experiences. Use this information to improve the \\nproduct and enhance brand perception. \\n5. Conduct surveys to understand pain points and areas for improvement. Use this \\ndata to enhance user engagement strategies. \\n6. Continuously update the knowledge base with troubleshooting guides, FAQs, and \\nbest practices. Make it easily accessible to users. \\n7. Collaborate with localization teams to ensure that ERGO’s software and \\ndocumentation are available in the local languages of the new markets. \\n8. Anticipate region -specific technical challenges. Prepare troubleshooting guides \\ntailored to the new markets. \\n9. Host webinars or virtual training sessions for existing and potential users. Teach \\nthem how to make the most of ERGO’s services. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the IT Security Analyst \\nmust; \\n1. Collaborate with the marketing team to create engaging technical content. Write \\nblog posts, FAQs, or video tutorials about ERGO’s unique features and benefits. \\n2. Encourage existing users to refer new users. Offer incentives or discounts for \\nsuccessful referrals. \"),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 10}, page_content='3. Understand cultural nuances and adapt communication accordingly. Provide user \\nsupport that aligns with local customs and preferences. \\nDEVOPS ENGINEER: \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nshall ; \\n1. Collaborate with legal and compliance teams to understand data protection laws \\nand ensure adherence in each region. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nwill; \\n1. Implement monitoring tools to track website performance, uptime, and user \\ninteractions. Set up alerts for any anomalies or downtime. \\n2. Ensure that ERGO’s infrastructure can handle increased traffic during marketing \\ncampaigns. Autoscale servers and optimize load balancers. \\n3. Design robust and fault -tolerant systems to handle user registrations, logins, and \\ndata processing. Use redundancy and failover mechanisms. \\n4. optimize content delivery networks (CDNs). Reduce latency for users accessing \\nERGO’s content. \\n5. Set up infrastructure to support A/B testing for different content variations. Monitor \\nuser engagement metrics for each variant. \\n6. Customize deployment scripts for each new market. Consider regional cloud \\nproviders and data centers. \\nUnder the oversight and control of the IT OPERATIONS MANAGER the DEVOPS engineer \\nmust; \\n1. Set up CI/CD pipelines to streamline marketing campaign deployments. Automate \\nthe rollout of new landing pages, banners, and promotional content. \\n2. Collaborate with DBA to simulate heavy user loads during testing. Optimize \\ndatabase queries and API endpoints for scalability. \\n3. Create a feedback loop between DevOps and content teams. Rapidly deploy \\nchanges based on user feedback to improve engagement. \\n4. Set up databases with geo -replication to ensure data availability and low latency in \\nnew markets. \\n \\n \\n \\n \\n \\n \\n '),\n", + " Document(metadata={'source': '/root/ds_erp_ai/data/raw/test_sop.pdf', 'page': 11}, page_content=' \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "docs" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'SOPsResponse' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRoles_response\u001b[39;00m(BaseModel):\n\u001b[1;32m 20\u001b[0m roles: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m]\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mSopGenerator\u001b[39;00m:\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi_key \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mgetenv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOPENAI_API_KEY\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[1], line 89\u001b[0m, in \u001b[0;36mSopGenerator\u001b[0;34m()\u001b[0m\n\u001b[1;32m 60\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient\u001b[38;5;241m.\u001b[39mbeta\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mparse(\n\u001b[1;32m 61\u001b[0m model\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel,\n\u001b[1;32m 62\u001b[0m messages\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 83\u001b[0m temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m\n\u001b[1;32m 84\u001b[0m )\n\u001b[1;32m 86\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m json\u001b[38;5;241m.\u001b[39mloads(response\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent)\n\u001b[0;32m---> 89\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgenerate_sops\u001b[39m(\u001b[38;5;28mself\u001b[39m, roles, docs_text) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[43mSOPsResponse\u001b[49m:\n\u001b[1;32m 90\u001b[0m roles_sops_all \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m role \u001b[38;5;129;01min\u001b[39;00m roles:\n", - "\u001b[0;31mNameError\u001b[0m: name 'SOPsResponse' is not defined" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import json\n", @@ -167,6 +187,26 @@ "roles = [\"Devops engineers\"]\n", "sops_response = service.check_role_sop(roles,docs)" ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'message': 'SOPs found for the roles: Devops engineers', 'status': True}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sops_response" + ] } ], "metadata": { @@ -176,7 +216,15 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", "version": "3.10.13" } }, diff --git a/src/api/routes/sops.py b/src/api/routes/sops.py index 469f236..fb96ba1 100644 --- a/src/api/routes/sops.py +++ b/src/api/routes/sops.py @@ -59,7 +59,7 @@ def get_roles(): -@sops_bp.route('/generate_sops', methods=['POST']) +@sops_bp.route('/generate_sops_from_doc', methods=['POST']) def generate_sops(): # Check if the POST request has the file part if 'document' not in request.files: @@ -101,7 +101,7 @@ def generate_sops(): return jsonify({"error": "Document cannot extract SOPs", "message": status_check["message"]}), 400 # Generate SOPs based on the roles provided - sops = sop_generator.generate_sops(roles, docs) + sops = sop_generator.generate_sops_from_doc(roles, docs) # Cleanup: Delete all files in the upload directory after processing delete_all_files_in_directory(upload_folder) @@ -114,3 +114,120 @@ def generate_sops(): return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500 return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400 + + + +@sops_bp.route('/generate_sops_from_info', methods=['POST']) +def generate_sops_from_info(): + """ + Generate SOPs based on role information provided in the request body. + """ + try: + # Get role information from the request body + roles_info = request.json.get('roles_info') + if not roles_info: + return jsonify({"error": "No role information provided", "message": "Please provide role information in the 'roles_info' field."}), 400 + + # Generate SOPs based on the provided role information + sops_response = sop_generator.generate_sops_from_info(roles=roles_info) + + return jsonify({"sops": sops_response, "message": "SOPs successfully generated based on the provided role information."}), 200 + + except Exception as e: + return jsonify({"error": "Processing error", "message": f"An error occurred while generating SOPs: {str(e)}"}), 500 + + +@sops_bp.route('/generate_sops_by_role_and_area', methods=['POST']) +def generate_sops_by_role_and_area(): + try: + # Get role and area from the request body + role = request.json.get('role') + area = request.json.get('area') + + if not role or not area: + return jsonify({"error": "Missing parameters", "message": "Both 'role' and 'area' fields are required."}), 400 + + # Generate SOPs based on the provided role and area + sops_response = sop_generator.generate_sops_by_role_and_area(role=role, area=area) + + return jsonify({"sops": sops_response, "message": f"SOPs successfully generated for role '{role}' in area '{area}'."}), 200 + + except Exception as e: + return jsonify({"error": "Processing error", "message": f"An error occurred while generating SOPs: {str(e)}"}), 500 + + + +@sops_bp.route('/executive/generate_sops_from_questionnaire', methods=['POST']) +def generate_executive_sops_from_questionnaire(): + try: + # Get data from the request body + data = request.json + + # Generate SOPs based on the questionnaire answers + sops_response = sop_generator.generate_executive_sops_from_questionnaire(data) + + return jsonify({"sops": sops_response, "message": "SOPs successfully generated from the questionnaire."}), 200 + + except Exception as e: + return jsonify({"error": "Processing error", "message": f"An error occurred while generating SOPs: {str(e)}"}), 500 + + + + +@sops_bp.route('/executive/generate_sops_from_doc', methods=['POST']) +def generate_executive_sops_from_doc(): + """ + Generate SOPs for executives based on a document containing vision and mission. + """ + # Check if the POST request has the file part + if 'document' not in request.files: + return jsonify({"error": "No file part", "message": "Please upload a file with the key 'document'."}), 400 + + file = request.files['document'] + + # If the user does not select a file, the browser may also submit an empty part without filename + if file.filename == '': + return jsonify({"error": "No selected file", "message": "A file was not selected for upload. Please select a valid file."}), 400 + + if file and allowed_file(file.filename): + filename = secure_filename(file.filename) + upload_folder = current_app.config['UPLOAD_FOLDER'] + file_path = os.path.join(upload_folder, filename) + + # Save the file to the upload folder + file.save(file_path) + + try: + # Use the utility function to generate docs from the file + docs = load_document(file_path) + + + # Use LLM to extract Vision and Mission sections from the document + vision_section, mission_section = sop_generator.extract_vision_and_mission(docs) + + if not vision_section or not mission_section: + # Cleanup: Delete all files in the upload directory if parsing fails + delete_all_files_in_directory(upload_folder) + return jsonify({"error": "Missing Vision and Mission", "message": "The document does not contain or properly define the company's vision and mission."}), 400 + + # Organize extracted data + extracted_data = { + "role": "Executive", + "organization vision": [vision_section], + "organization strategic goals": [mission_section] + } + + # Generate SOPs based on the extracted vision and goals + sops_response = sop_generator.generate_executive_sops_from_questionnaire(extracted_data) + + # Cleanup: Delete all files in the upload directory after processing + delete_all_files_in_directory(upload_folder) + + return jsonify({"sops": sops_response, "message": "SOPs successfully generated from the document."}), 200 + + except Exception as e: + # Cleanup: Delete all files in the upload directory if an error occurs + delete_all_files_in_directory(upload_folder) + return jsonify({"error": "Processing error", "message": f"An error occurred while processing the document: {str(e)}"}), 500 + + return jsonify({"error": "File type not allowed", "message": "The uploaded file type is not allowed. Please upload a PDF, DOC, or DOCX file."}), 400 diff --git a/src/services/sop_generator.py b/src/services/sop_generator.py index 8145dca..0c7abb4 100644 --- a/src/services/sop_generator.py +++ b/src/services/sop_generator.py @@ -22,6 +22,12 @@ class RolesResponse(BaseModel): class SOPsResponse(BaseModel): roles_sops: Dict[str, SOPs] = Field(default_factory=dict) +class VisionMissionResponse(BaseModel): + vision: Optional[str] + mission: Optional[str] + message: str + + class SopGenerator: def __init__(self): self.api_key = os.getenv("OPENAI_API_KEY") @@ -77,7 +83,7 @@ class SopGenerator: ) return json.loads(response.choices[0].message.content) - def generate_sops(self, roles: List[str], docs) -> SOPsResponse: + def generate_sops_from_doc(self, roles: List[str], docs) -> SOPsResponse: roles_sops_all = {} docs_text = self._extract_text_from_docs(docs) @@ -88,14 +94,26 @@ class SopGenerator: messages=[ { "role": "system", - "content": f'''You are a Standard Operating Procedure (SOP) extractor. - Your task is to find SOPs for the role "{role}" in the provided text. - SOPs should be categorized under "must", "shall", and "will". - If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document, - but only if there is clear evidence within the text. - Do not generate or assume SOPs that are not directly supported by the document. - Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present. - If no SOPs are found for the role, return an empty list for each category.''', + "content": f'''Your job is to extract Standard Operating Procedure (SOP) questions specifically for the role of "{role}" from the provided text. + + Instructions: + + Categorization: Organize the questions under three categories: "must," "shall," and "will." + Direct Questions: The questions should be directly addressed to the person in the role. Do not reference the role itself in the question. + Contextual Inference: If SOPs for the role are not explicitly stated, infer them from the context, but only if there is clear evidence within the text. Do not generate or assume SOPs that are not directly supported by the document. + Empty Lists: If no SOPs are found for the role, return an empty list for each category. + Format: The questions should be direct and concise, e.g., "Have you completed all the required reports?" + Example: + + Category: Must + + Have you completed all the required reports? + Category: Shall + + Are you ensuring that all team members follow the safety protocols? + Category: Will + + Are you planning to review the budget next week?''', }, { "role": "user", @@ -108,6 +126,211 @@ class SopGenerator: ) role_sop = json.loads(response.choices[0].message.content) roles_sops_all[role] = role_sop + + + return roles_sops_all + + def generate_sops_from_info(self, roles: List[Dict[str, str]]): + + roles_sops_all = {} + + for role_info in roles: + role_title = role_info.get("title", "Unknown Role") + print(f"Role title : {role_title}") + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''Your job is to generate Standard Operating Procedures (SOPs) for the role of "{role_title}" based on the following information provided: + + Responsibilities: {role_info.get("responsibilities", "Not provided")} + Objectives: {role_info.get("objectives", "Not provided")} + Tools: {role_info.get("tools", "Not provided")} + Challenges: {role_info.get("challenges", "Not provided")} + + Instructions: + + Categorization: Organize the SOPs under three categories: "must," "shall," and "will." + Direct Instructions: The SOPs should directly address the responsibilities, objectives, and challenges. + Contextual Inference: If SOPs for the role are not explicitly stated, infer them from the context provided. + Empty Lists: If no SOPs are generated, return an empty list for each category. + Format: The SOPs should be direct and concise. + ''', + } + ], + response_format=RoleSOPs, + max_tokens=1024, + temperature=0.1 + ) + role_sop = json.loads(response.choices[0].message.content) + roles_sops_all[role_title] = role_sop return roles_sops_all + + + + def generate_sops_by_role_and_area(self, role: str, area: str) -> RoleSOPs: + + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''Your job is to generate Standard Operating Procedures (SOPs) for the role of "{role}" with a focus on the area "{area}" based on the following instructions: + + Instructions: + + Categorization: Organize the SOPs under three categories: "must," "shall," and "will." + Direct Instructions: The SOPs should directly address responsibilities, objectives, and challenges related to the area of "{area}" for the role of "{role}". + Contextual Inference: If SOPs for the area are not explicitly stated, infer them from the role and area context provided. + Empty Lists: If no SOPs are generated, return an empty list for each category. + Format: The SOPs should be direct and concise. + ''', + } + ], + response_format=RoleSOPs, + max_tokens=1024, + temperature=0.1 + ) + return json.loads(response.choices[0].message.content) + + + def generate_executive_sops_from_questionnaire(self, data: dict) -> RoleSOPs: + """ + Generate SOPs based on the answers from an executive questionnaire. + + :param data: A dictionary containing the vision, strategic goals, and department goals. + :return: SOPs categorized by "must", "shall", and "will". + """ + vision_list = data.get("organization vision", []) + strategic_goals = data.get("organization strategic goals", []) + department_goals = data.get("department goals", []) + + # Format vision and goals as text + formatted_vision = "\n".join([f"- {vision}" for vision in vision_list]) + formatted_goals = "\n".join([f"- {goal}" for goal in strategic_goals]) + formatted_department_goals = "\n".join([ + f"{dept}: " + ", ".join([f"{goal}" for goal in goals]) + for dept_dict in department_goals + for dept, goals in dept_dict.items() + ]) + + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''Generate Standard Operating Procedures (SOPs) for an executive role based on the following information: + + Organizational Vision: + {formatted_vision} + + Organizational Strategic Goals: + {formatted_goals} + + Departmental Strategic Goals: + {formatted_department_goals} + + Instructions: + + Categorization: Organize the SOPs under three categories: "must," "shall," and "will." + Direct Instructions: The SOPs should address leadership responsibilities for achieving the vision, strategic contribution, and goals outlined. + Empty Lists: If no SOPs are generated, return an empty list for each category. + Format: SOPs should be direct and concise. + ''', + } + ], + response_format=RoleSOPs, + max_tokens=1024, + temperature=0.1 + ) + return json.loads(response.choices[0].message.content) + + + + + def generate_executive_sops_from_doc(self,docs) -> SOPsResponse: + + + docs_text = self._extract_text_from_docs(docs) + + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f'''Your job is to extract Standard Operating Procedure (SOP) questions specifically for the role of "{role}" from the provided text. + + Instructions: + + Categorization: Organize the questions under three categories: "must," "shall," and "will." + Direct Questions: The questions should be directly addressed to the person in the role. Do not reference the role itself in the question. + Contextual Inference: If SOPs for the role are not explicitly stated, infer them from the context, but only if there is clear evidence within the text. Do not generate or assume SOPs that are not directly supported by the document. + Empty Lists: If no SOPs are found for the role, return an empty list for each category. + Format: The questions should be direct and concise, e.g., "Have you completed all the required reports?" + Example: + + Category: Must + + Have you completed all the required reports? + Category: Shall + + Are you ensuring that all team members follow the safety protocols? + Category: Will + + Are you planning to review the budget next week?''', + }, + { + "role": "user", + "content": [{"type": "text", "text": text} for text in docs_text], + } + ], + response_format=VisionMissionResponse, + max_tokens=1024, + temperature=0.1 + ) + return response + + def extract_vision_and_mission(self, docs: str): + """ + Use LLM to extract Vision and Mission from the document text. + + :param document_text: The text content of the document. + :return: (vision_section, mission_section) + """ + docs_text = self._extract_text_from_docs(docs) + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": '''You are a helpful assistant that extracts specific sections from business documents. + Your task is to extract the "Vision" and "Mission" sections (or "Goals" if "Mission" is not found). + mission is basically same as goals just mission as goals if not found + Provide the sections exactly as they appear in the document.''' + }, + + { + "role": "user", + "content": [{"type": "text", "text": text} for text in docs_text], + } + ], + max_tokens=1024, + temperature=0.1, + response_format=VisionMissionResponse, + ) + + # Parse the response from the LLM + extracted_text = json.loads(response.choices[0].message.content) + + + print(F"extracted text:{extracted_text}") + # Assuming the response contains fields for 'vision' and 'mission' (or 'goals') + vision_section = extracted_text["vision"] + mission_section = extracted_text["mission"] + + return vision_section, mission_section + + diff --git a/src/utils/utils.py b/src/utils/utils.py index 97bac5f..ce7154a 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -7,7 +7,6 @@ def delete_file(file_path): print(f"Error deleting file {file_path}: {e}") -import os def delete_all_files_in_directory(directory_path): try: