erp_ai/notebooks/test.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pypandoc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# The directory one level above the current working directory is used as\n",
    "# the project root (os.pardir is the '..' path component).\n",
    "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
    "\n",
    "# Put the project root at the front of the module search path, once only,\n",
    "# so the `src` package imported below can be resolved.\n",
    "if project_root not in sys.path:\n",
    "    sys.path[:0] = [project_root]\n",
    "\n",
    "# Project-local imports; they rely on the sys.path entry added above.\n",
    "from src.services.sop_generator import SopGenerator\n",
    "from src.utils.pdf_loader import load_pdf_to_docs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pypandoc alone needs a pandoc executable on PATH; the pypandoc_binary\n",
    "# distribution ships pypandoc together with pandoc. %pip installs into the\n",
    "# environment of the running kernel.\n",
    "%pip install pypandoc_binary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
     "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 54\u001b[0m\n\u001b[1;32m     51\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     53\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to .docx internally)\u001b[39;00m\n\u001b[0;32m---> 54\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     56\u001b[0m \u001b[38;5;66;03m# Load a .docx file\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m     58\u001b[0m \n\u001b[1;32m     59\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m     60\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
      "Cell \u001b[0;32mIn[1], line 41\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path, use_unstructured)\u001b[0m\n\u001b[1;32m     38\u001b[0m         loader \u001b[38;5;241m=\u001b[39m Docx2txtLoader(file_path)\n\u001b[1;32m     39\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m     40\u001b[0m     \u001b[38;5;66;03m# Convert .doc to .docx first\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m     docx_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_doc_to_docx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     42\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m load_document(docx_path, use_unstructured\u001b[38;5;241m=\u001b[39muse_unstructured)\n\u001b[1;32m     43\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "Cell \u001b[0;32mIn[1], line 16\u001b[0m, in \u001b[0;36mconvert_doc_to_docx\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;124;03mConvert a .doc file to .docx using pypandoc.\u001b[39;00m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;124;03m    str: The path to the converted .docx file.\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     15\u001b[0m docx_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdocx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdocx_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docx_path\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m    197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m    198\u001b[0m     discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    201\u001b[0m \u001b[43m                  \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    202\u001b[0m \u001b[43m                  \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    203\u001b[0m \u001b[43m                  \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m    361\u001b[0m _check_log_handler()\n\u001b[1;32m    363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m    367\u001b[0m     logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m    789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m    790\u001b[0m \u001b[38;5;124m    See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m    791\u001b[0m \u001b[38;5;124m    for installation options\u001b[39m\n\u001b[1;32m    792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m    793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m    794\u001b[0m \u001b[38;5;124m    ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m    795\u001b[0m \n\u001b[1;32m    796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    798\u001b[0m               \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    799\u001b[0m               \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pypandoc\n",
    "from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader\n",
    "\n",
    "def convert_doc_to_docx(doc_path: str) -> str:\n",
    "    \"\"\"\n",
    "    Produce a .docx copy of a .doc file via pypandoc.\n",
    "\n",
    "    The output is written alongside the input: same directory and base name,\n",
    "    with the extension replaced by '.docx'.\n",
    "\n",
    "    Args:\n",
    "        doc_path (str): Location of the .doc file to convert.\n",
    "\n",
    "    Returns:\n",
    "        str: Location of the .docx file that was requested as output.\n",
    "\n",
    "    Raises:\n",
    "        OSError: Raised by pypandoc when no pandoc executable can be found.\n",
    "    \"\"\"\n",
    "    # NOTE(review): pandoc's documented input formats do not appear to include\n",
    "    # the legacy binary .doc format - confirm this conversion can succeed even\n",
    "    # with pandoc installed.\n",
    "    stem, _ext = os.path.splitext(doc_path)\n",
    "    target = stem + '.docx'\n",
    "    pypandoc.convert_file(doc_path, to='docx', outputfile=target)\n",
    "    return target\n",
    "\n",
    "def load_document(file_path: str, use_unstructured: bool = False):\n",
    "    \"\"\"\n",
    "    Read a PDF, DOCX, or DOC file into document objects.\n",
    "\n",
    "    A .doc input is first converted to .docx with convert_doc_to_docx, and the\n",
    "    converted file is then loaded through this same function.\n",
    "\n",
    "    Args:\n",
    "        file_path (str): Location of the file to read.\n",
    "        use_unstructured (bool): For .docx input, use UnstructuredWordDocumentLoader\n",
    "            when True and Docx2txtLoader when False (the default).\n",
    "\n",
    "    Returns:\n",
    "        List[Document]: Whatever the selected loader's load() call returns.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the file extension is not .pdf, .docx, or .doc.\n",
    "    \"\"\"\n",
    "    _, extension = os.path.splitext(file_path)\n",
    "    extension = extension.lower()\n",
    "\n",
    "    if extension == '.doc':\n",
    "        # Legacy Word file: convert, then re-enter with the .docx path.\n",
    "        converted_path = convert_doc_to_docx(file_path)\n",
    "        return load_document(converted_path, use_unstructured=use_unstructured)\n",
    "\n",
    "    if extension == '.pdf':\n",
    "        return PyPDFLoader(file_path).load()\n",
    "\n",
    "    if extension == '.docx':\n",
    "        loader_cls = UnstructuredWordDocumentLoader if use_unstructured else Docx2txtLoader\n",
    "        return loader_cls(file_path).load()\n",
    "\n",
    "    raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
    "\n",
    "# Example usage:\n",
    "# All sample files live in one directory. Set the ERP_AI_DATA_DIR environment\n",
    "# variable to point somewhere else; the default is the original location.\n",
    "DATA_DIR = os.environ.get(\"ERP_AI_DATA_DIR\", \"/root/ds_erp_ai/data/raw\")\n",
    "\n",
    "doc_file_path = os.path.join(DATA_DIR, \"document.doc\")\n",
    "#docx_file_path = os.path.join(DATA_DIR, \"test_docx.docx\")\n",
    "pdf_file_path = os.path.join(DATA_DIR, \"test_sop.pdf\")\n",
    "\n",
    "# Load a .doc file (converts to .docx internally)\n",
    "doc_docs = load_document(doc_file_path)\n",
    "\n",
    "# Load a .docx file\n",
    "#docx_docs = load_document(docx_file_path)\n",
    "\n",
    "# Load a PDF file\n",
    "pdf_docs = load_document(pdf_file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
     "evalue": "No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[4], line 48\u001b[0m\n\u001b[1;32m     45\u001b[0m pdf_file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/root/ds_erp_ai/data/raw/test_sop.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     47\u001b[0m \u001b[38;5;66;03m# Load a .doc file (converts to PDF first)\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m doc_docs \u001b[38;5;241m=\u001b[39m \u001b[43mload_document\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_file_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     50\u001b[0m \u001b[38;5;66;03m# Load a .docx file (converts to PDF first)\u001b[39;00m\n\u001b[1;32m     51\u001b[0m \u001b[38;5;66;03m#docx_docs = load_document(docx_file_path)\u001b[39;00m\n\u001b[1;32m     52\u001b[0m \n\u001b[1;32m     53\u001b[0m \u001b[38;5;66;03m# Load a PDF file\u001b[39;00m\n\u001b[1;32m     54\u001b[0m pdf_docs \u001b[38;5;241m=\u001b[39m load_document(pdf_file_path)\n",
      "Cell \u001b[0;32mIn[4], line 33\u001b[0m, in \u001b[0;36mload_document\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m     29\u001b[0m extension \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(file_path)[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m     31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extension \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.doc\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.docx\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m     32\u001b[0m     \u001b[38;5;66;03m# Convert .doc or .docx to PDF first\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m     pdf_path \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_to_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     34\u001b[0m     loader \u001b[38;5;241m=\u001b[39m PyPDFLoader(pdf_path)\n\u001b[1;32m     35\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m extension \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m:\n",
      "Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mconvert_to_pdf\u001b[0;34m(doc_path)\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;124;03mConvert a .doc or .docx file to PDF using pypandoc.\u001b[39;00m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;124;03m    str: The path to the converted PDF file.\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     15\u001b[0m pdf_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39msplitext(doc_path)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.pdf\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mpypandoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pdf_path\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:200\u001b[0m, in \u001b[0;36mconvert_file\u001b[0;34m(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m    197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(discovered_source_files) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m    198\u001b[0m     discovered_source_files \u001b[38;5;241m=\u001b[39m discovered_source_files[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiscovered_source_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mto\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    201\u001b[0m \u001b[43m                  \u001b[49m\u001b[43moutputfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutputfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    202\u001b[0m \u001b[43m                  \u001b[49m\u001b[43mverify_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_format\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msandbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msandbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    203\u001b[0m \u001b[43m                  \u001b[49m\u001b[43mcworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcworkdir\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:364\u001b[0m, in \u001b[0;36m_convert_input\u001b[0;34m(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir)\u001b[0m\n\u001b[1;32m    361\u001b[0m _check_log_handler()\n\u001b[1;32m    363\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnsuring pandoc path...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 364\u001b[0m \u001b[43m_ensure_pandoc_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_format:\n\u001b[1;32m    367\u001b[0m     logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVerifying format...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m~/ds_erp_ai/erp/lib/python3.10/site-packages/pypandoc/__init__.py:797\u001b[0m, in \u001b[0;36m_ensure_pandoc_path\u001b[0;34m()\u001b[0m\n\u001b[1;32m    789\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m    790\u001b[0m \u001b[38;5;124m    See http://johnmacfarlane.net/pandoc/installing.html\u001b[39m\n\u001b[1;32m    791\u001b[0m \u001b[38;5;124m    for installation options\u001b[39m\n\u001b[1;32m    792\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[1;32m    793\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m    794\u001b[0m \u001b[38;5;124m    ---------------------------------------------------------------\u001b[39m\n\u001b[1;32m    795\u001b[0m \n\u001b[1;32m    796\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m))\n\u001b[0;32m--> 797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo pandoc was found: either install pandoc and add it\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    798\u001b[0m               \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto your PATH or or call pypandoc.download_pandoc(...) or\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    799\u001b[0m               \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstall pypandoc wheels with included pandoc.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mOSError\u001b[0m: No pandoc was found: either install pandoc and add it\nto your PATH or or call pypandoc.download_pandoc(...) or\ninstall pypandoc wheels with included pandoc."
     ]
    }
   ],
   "source": [
    "import os\n",
    "from spire.doc import Document, FileFormat\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "def convert_word_to_pdf(doc_path: str) -> str:\n",
    "    \"\"\"\n",
    "    Convert a .doc or .docx file to PDF using Spire.Doc.\n",
    "\n",
    "    The PDF is written next to the input file, with the same base name and a\n",
    "    '.pdf' extension.\n",
    "\n",
    "    Args:\n",
    "        doc_path (str): The path to the .doc or .docx file.\n",
    "\n",
    "    Returns:\n",
    "        str: The path to the converted PDF file.\n",
    "    \"\"\"\n",
    "    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'\n",
    "\n",
    "    document = Document()\n",
    "    try:\n",
    "        # Spire.Doc for Python names its methods in PascalCase\n",
    "        # (LoadFromFile / SaveToFile / Close), not snake_case.\n",
    "        document.LoadFromFile(doc_path)\n",
    "        document.SaveToFile(pdf_path, FileFormat.PDF)\n",
    "    finally:\n",
    "        # Release the document even when loading or saving fails.\n",
    "        document.Close()\n",
    "\n",
    "    return pdf_path\n",
    "\n",
    "def load_document(file_path: str):\n",
    "    \"\"\"\n",
    "    Load a PDF, DOCX, or DOC file as document objects, always reading a PDF.\n",
    "\n",
    "    Word files (.doc / .docx) are converted with convert_word_to_pdf and the\n",
    "    resulting PDF is read; .pdf files are read as they are.\n",
    "\n",
    "    Args:\n",
    "        file_path (str): Location of the file to read.\n",
    "\n",
    "    Returns:\n",
    "        List[Document]: Whatever PyPDFLoader.load() returns for the PDF.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the file extension is not .pdf, .docx, or .doc.\n",
    "    \"\"\"\n",
    "    extension = os.path.splitext(file_path)[1].lower()\n",
    "\n",
    "    # Work out which PDF to read, converting Word input when necessary.\n",
    "    if extension == '.pdf':\n",
    "        source_pdf = file_path\n",
    "    elif extension in ('.doc', '.docx'):\n",
    "        source_pdf = convert_word_to_pdf(file_path)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported file type: {extension}. Only .pdf, .docx, and .doc are supported.\")\n",
    "\n",
    "    return PyPDFLoader(source_pdf).load()\n",
    "\n",
    "# Example usage:\n",
    "# All sample files live in one directory. Set the ERP_AI_DATA_DIR environment\n",
    "# variable to point somewhere else; the default is the original location.\n",
    "DATA_DIR = os.environ.get(\"ERP_AI_DATA_DIR\", \"/root/ds_erp_ai/data/raw\")\n",
    "\n",
    "doc_file_path = os.path.join(DATA_DIR, \"document.doc\")\n",
    "docx_file_path = os.path.join(DATA_DIR, \"test_docx.docx\")\n",
    "pdf_file_path = os.path.join(DATA_DIR, \"test_sop.pdf\")\n",
    "\n",
    "# Load a .doc file (converts to PDF first)\n",
    "doc_docs = load_document(doc_file_path)\n",
    "\n",
    "# Load a .docx file (converts to PDF first)\n",
    "docx_docs = load_document(docx_file_path)\n",
    "\n",
    "# Load a PDF file\n",
    "pdf_docs = load_document(pdf_file_path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "erp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}