AI indexing completed

2024-08-16 17:37:28 +01:00
parent 713354371e
commit cff9511d86
13 changed files with 2843 additions and 257 deletions
@@ -1,19 +1,18 @@
 import sys, os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model
+from utils import create_vector_store, save_embedded_data, load_documents_from_directory, process_directory
 from loggings.logging_config import logger
 import time 
 # This module will load in the data, you only need to add the data path to it. 
 data_path = './data'
 # # loading the embeddings
 # logger.info(f"Loading the embeddings")
 # embeddings = load_embedding_model()
 # logger.info(f"Embeddings loaded")
 def load_data(data_path: str):
    logger.info(f"Loading data from {data_path}")
    start_time = time.time()
    # logging the start time
    logger.info(f"Start time: {start_time}")
    documents, docs_id, num_pages = load_documents_from_directory(data_path)
    logger.info(f"Data loaded")
    logger.info(f"Creating vector store")
@@ -23,8 +22,17 @@ def load_data(data_path: str):
    # saving the embedded data
    save_embedded_data(embed_db)
    logger.info(f"Vector store saved")
    end_time = time.time()
    logger.info(f"End time: {end_time}")
    time_taken = end_time - start_time
    logger.info(f"Time taken: {time_taken}")
    print("Vector store created and saved")
    # creating the thumbnails
    logger.info(f"Creating thumbnails")
    status = process_directory(data_path)
    print(f"{status}: Thumbnails created.")
    logger.info(f"Thumbnails created")
    return embed_db
@@ -4,16 +4,12 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from utils import search, load_embedded_data
+from utils import  load_embedded_data, load_documents_from_directory, create_vector_store, save_embedded_data
 from search import search_and_summarize
 from data_ingest import load_data
 app = FastAPI()
 # Initialize global variables for FAISS index and vector store
 try:
    vector_store = load_embedded_data()
 except Exception as e:
    vector_store = None
 # Define allowed origins for CORS
 origins = [
@@ -37,19 +33,21 @@ class SearchRequest(BaseModel):
@app.get("/load_documents")
 def load_documents(directory: str):
    global vector_store
-    # Load documents using the utility function
+    # loading the documents from the directory
-    vector_store = load_data(directory)
+    documents, docs_id, num_pages = load_documents_from_directory(directory)
    # embedding the documents
    embed_db = create_vector_store(documents, docs_id, num_pages)
    # saving the embedded data
    status = save_embedded_data(embed_db)
    return {"status": "Documents loaded successfully"}
-@app.get("/search")
+@app.post("/search")
 def search(request: SearchRequest):
    global vector_store
    # Perform search using the utility function
-    results = search(vector_store, request.query)
+    results = search_and_summarize(request.query)
    return {"results": results}
@@ -13,6 +13,8 @@ docx2txt
 docx
 fastapi[standard]
 pdfplumber
 pypdf
 python-docx
 pytesseract
 groq
 python-dotenv
@@ -1,21 +1,98 @@
 from utils import search
 import sys, os
 import json
 #  Add the root directory to sys.path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from loggings.logging_config import logger
 # a function to get data description
 def get_data_description(data_path):
    """Return the stored summary text for a document.

    The lookup key is derived from ``data_path`` by dropping any directory
    part (forward or back slashes) and cutting the remaining name at its
    first dot. The key is then looked up in ``data/data.json`` (path relative
    to the current working directory).

    Args:
        data_path: File path or bare name of the document.

    Returns:
        The entry's ``doc_summary`` value, or ``'No description available'``
        when the key is not present in the JSON file.
    """
    # keep only the last path component, then cut it at its first '.'
    stem = data_path.rpartition('/')[2].rpartition('\\')[2].partition('.')[0]
    # read the description catalogue
    with open('data/data.json') as handle:
        catalog = json.load(handle)
    if stem not in catalog.keys():
        return 'No description available'
    return catalog[stem]['doc_summary']
 # getting data thumbnails.
 def get_data_thumbnail(data_path, timestamp = None):
    """Return the path of a thumbnail image for a document, if one exists.

    Two locations are checked, in order:

    1. ``data/thumbnails/<name>.png`` - a single thumbnail for the file.
    2. ``data/<name>/<start>-<end>s.png`` - a frame named after a timestamp
       span converted from ``min:sec-min:sec`` to whole seconds.

    ``<name>`` is ``data_path`` without its directory part and cut at the
    first dot. Paths are relative to the current working directory.

    Args:
        data_path: File path or bare name of the document.
        timestamp: Optional list of ``'m:s-m:s'`` strings. Spans are tried in
            order and the first one with an existing frame image wins, so a
            missing frame for the first span no longer hides later frames.

    Returns:
        The thumbnail path as a string, or ``None`` when nothing matches.
    """
    # ensuring no // or / or extension is present
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]
    # first check: a dedicated .png in the thumbnail folder
    static_thumbnail = f'data/thumbnails/{file_name}.png'
    if os.path.exists(static_thumbnail):
        return static_thumbnail
    # second check: a folder named after the file that holds per-span frames
    frames_dir = f'data/{file_name}'
    if not timestamp or not os.path.exists(frames_dir):
        return None
    for span in timestamp:
        try:
            start, end = span.split('-')
            # convert min:sec (e.g. 03:30) to seconds
            start_parts = start.split(':')
            end_parts = end.split(':')
            start_s = int(start_parts[0])*60 + int(start_parts[1])
            end_s = int(end_parts[0])*60 + int(end_parts[1])
        except (ValueError, IndexError):
            # thumbnails are best-effort: skip spans that are not 'm:s-m:s'
            continue
        candidate = f'{frames_dir}/{start_s}-{end_s}s.png'
        # now checking if the file exists
        if os.path.exists(candidate):
            return candidate
    return None
 def summarize_doc_search(data):
    """Group raw search hits by file and attach a description and thumbnail.

    Args:
        data: Iterable of hit dictionaries. Each hit is identified by its
            ``'source'`` key, falling back to ``'filename'`` for hits that
            carry no ``'source'``. Optional keys read are ``'page'``,
            ``'timestamp'`` and ``'file_type'`` (``'pdf'`` when absent).

    Returns:
        A list with one dictionary per distinct file, in first-seen order,
        holding ``filename``, ``pages``, ``timestamps``, ``file_type``,
        ``description`` and ``thumbnail``.
    """
    summary = {}
    for item in data:
        # Not every hit has a 'source'; some are keyed by 'filename' instead.
        # Indexing item['source'] directly raised KeyError for those.
        source = item.get('source') or item.get('filename')
        if source is None:
            # nothing identifies the file, so the hit cannot be grouped
            continue
        entry = summary.setdefault(
            source,
            {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')},
        )
        if 'page' in item:
            entry['pages'].append(item['page'])
        if 'timestamp' in item:
            entry['timestamps'].append(item['timestamp'])
    # Formatting the summary as a list of dictionaries
    summarized_list = []
    for key, value in summary.items():
        # strip the directory part for both Windows and POSIX separators
        filename = key.replace("\\", "/").split("/")[-1]
        timestamps = value['timestamps']
        summarized_list.append({
            'filename': filename,
            'pages': value['pages'],
            'timestamps': timestamps,
            'file_type': value['file_type'],
            # getting the file description and thumbnail
            'description': get_data_description(filename),
            # an empty timestamp list is passed as None (the default)
            'thumbnail': get_data_thumbnail(filename, timestamps or None),
        })
    return summarized_list
 # a function that performs the search and the summarization together
 def search_and_summarize(query):
    """Run a search for ``query`` and condense the hits into per-file summaries.

    The query is passed to ``search`` and the returned hits are handed to
    ``summarize_doc_search``. Progress is logged before and after each step.

    Args:
        query: The search query, forwarded unchanged to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` returns for the search hits.
    """
    # step 1: retrieve the matching hits
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")
    # step 2: collapse the hits into one entry per file
    logger.info("Summarizing search results")
    condensed = summarize_doc_search(hits)
    logger.info("Search results summarized")
    return condensed
 if __name__ == "__main__":
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
-    logger.info(f"Searching for {query}")
+    logger.info(f"Search query received: {query}")
-    page_content, all, pages = search(query)
+    logger.info("Searching and summarizing the search results")
-    logger.info("Search completed")
+    search_results = search_and_summarize(query)
-    logger.info(f"Page content: {page_content}")
+    logger.info("Search results summarized")
-    print(f"Page content: {all}")
+    print(search_results)
    print(f"Pages: {pages}")
    print("Search completed")
@@ -11,16 +11,29 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-08-16 16:06:32,880 - INFO - Loading the embedding model\n",
      "2024-08-16 16:06:38,758 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  from tqdm.autonotebook import tqdm, trange\n",
      "\n",
      "2024-08-16 16:06:47,268 - INFO - PyTorch version 2.4.0+cu124 available.\n",
      "2024-08-16 16:06:47,868 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en\n",
      "2024-08-16 16:06:55,638 - INFO - Embedding model loaded\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
-     "execution_count": 2,
+     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -32,17 +45,51 @@
    "from langchain_groq import ChatGroq\n",
    "from langchain_core.prompts.prompt import PromptTemplate\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from collections import defaultdict\n",
    "import json\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setting up groq api key\n",
    "# os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# setting up groq api key\n",
+    "\n",
-    "os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
+    "# # chat set up\n",
    "# GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
    "\n",
    "\n",
    "# ### Chains #####\n",
    "# # Initiator\n",
    "# def doc_summarizer(document_page: list) -> str:\n",
    "#     initiator_prompt = PromptTemplate(\n",
    "#         template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
    "#         Create a short summary of the document based on the provided text. \n",
    "        \n",
    "#         Start with: This document is about...\n",
    "        \n",
    "#         <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
    "#         DOCUMENT: {document_page} \\n\n",
    "        \n",
    "#         <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
    "#         input_variables=[\"document_page\"],\n",
    "#     )\n",
    "\n",
    "#     initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
    "#     output = initiator_router.invoke({\"document_page\":document_page})\n",
    "#     return output\n"
   ]
  },
  {
@@ -51,41 +98,31 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "\n",
+    "document_page = 'Wirebrush WD 40'\n",
-    "# chat set up\n",
+    "# testing the function\n",
-    "GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
+    "# summary = doc_summarizer(document_page)"
    "\n",
    "\n",
    "### Chains #####\n",
    "# Initiator\n",
    "def doc_summarizer(document_page: list) -> str:\n",
    "    initiator_prompt = PromptTemplate(\n",
    "        template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
    "        Create a short summary of the document based on the provided text. \n",
    "        \n",
    "        Start with: This document is about...\n",
    "        \n",
    "        <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
    "        DOCUMENT: {document_page} \\n\n",
    "        \n",
    "        <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
    "        input_variables=[\"document_page\"],\n",
    "    )\n",
    "\n",
    "    initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
    "    output = initiator_router.invoke({\"document_page\":document_page})\n",
    "    return output\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-08-16 16:06:55,717 - INFO - Searching for Wirebrush WD 40\n",
      "2024-08-16 16:06:55,717 - INFO - Loading embedded data\n",
      "2024-08-16 16:06:56,487 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
      "  attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
      "\n",
      "2024-08-16 16:06:56,628 - INFO - Search completed\n"
     ]
    }
   ],
   "source": [
-    "document_page = 'How to change the engine oil of a toyota corrolla.'\n",
+    "docs = search(document_page)"
    "# testing the function\n",
    "summary = doc_summarizer(document_page)"
   ]
  },
  {
@@ -96,7 +133,46 @@
    {
     "data": {
      "text/plain": [
-       "'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
+       "[{'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 424},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '0:30-1:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '2:00-2:00',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '2:30-3:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '4:00-4:00',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '5:30-6:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '8:00-8:00',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '8:30-9:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '10:00-10:00',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '3:30-4:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'timestamp': '5:30-6:0',\n",
       "  'file_type': 'video'},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 329},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 264},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 290},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 201},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 326},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 315},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 317},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 325},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422}]"
      ]
     },
     "execution_count": 6,
@@ -104,202 +180,235 @@
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = search(document_page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
       "  'page': 1,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
       " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
       "  'page': 3,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
       "  'page': 2,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
       " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
       "  'page': 2,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
       "  'page': 3,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
       "  'page': 0,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
       "  'page': 5,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
       "  'page': 6,\n",
       "  'file_type': 'text'},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
       " {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
       " {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
       " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
-    "from collections import defaultdict\n",
+    "# a function to get data description\n",
-    "\n",
+    "def get_data_description(data_path):\n",
-    "def transform_file_data(input_data):\n",
+    "    # ensuring no // or / or extension is present\n",
-    "    # Create a dictionary to aggregate data by filename\n",
+    "    data_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
-    "    aggregated_data = defaultdict(lambda: {\n",
+    "    # print(data_name)\n",
-    "        'filename': '',\n",
+    "    # open the data.json file\n",
-    "        'pages': [],\n",
+    "    with open('data/data.json') as f:\n",
-    "        'timestamps': [],\n",
+    "        data = json.load(f)\n",
-    "        'description': 'lorem ipsum',\n",
+    "        existing_data = data.keys()\n",
-    "        'filetype': '',\n",
+    "        if data_name in existing_data:\n",
-    "        'thumbnail': '',\n",
+    "            return data[data_name]['doc_summary']\n",
-    "        'track_id': 123\n",
+    "        else:\n",
-    "    })\n",
+    "            return 'No description available'"
    "\n",
    "    for item in input_data:\n",
    "        if 'source' in item:\n",
    "            file_path = item['source']\n",
    "            filename = file_path.split('\\\\')[-1]\n",
    "            extension = filename.split('.')[-1]\n",
    "\n",
    "            aggregated_data[filename]['filename'] = filename\n",
    "            aggregated_data[filename]['filetype'] = extension\n",
    "            aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
    "\n",
    "            if extension in ['pdf', 'txt', 'docx']:\n",
    "                aggregated_data[filename]['pages'].append(item['page'])\n",
    "            elif extension in ['mp4', 'mkv', 'flv']:\n",
    "                aggregated_data[filename]['timestamps'].append(item['page'])\n",
    "            elif extension in ['mp3', 'wav', 'flac']:\n",
    "                aggregated_data[filename]['timestamps'].append(item['page'])\n",
    "            elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
    "                aggregated_data[filename].pop('pages', None)  # Remove pages if it's an image\n",
    "                aggregated_data[filename].pop('timestamps', None)  # Remove timestamps if it's an image\n",
    "\n",
    "        elif 'filename' in item:\n",
    "            filename = item['filename']\n",
    "            extension = item['file_type']\n",
    "            aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
    "            aggregated_data[filename]['filetype'] = extension\n",
    "            aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
    "            if 'duration' in item:\n",
    "                start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
    "                aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
    "\n",
    "    # Convert aggregated data to the desired output format\n",
    "    output_data = []\n",
    "    for filename, data in aggregated_data.items():\n",
    "        # Remove empty lists for pages and timestamps\n",
    "        if not data['pages']:\n",
    "            data.pop('pages', None)\n",
    "        if not data['timestamps']:\n",
    "            data.pop('timestamps', None)\n",
    "        output_data.append(data)\n",
    "\n",
    "    return output_data\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
      "{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
      "{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
      "{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
     ]
    }
   ],
   "source": [
    "output = transform_file_data(docs)\n",
    "for item in output:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
+       "\"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\""
       "  'pages': [1, 3, 2, 0],\n",
       "  'description': 'lorem ipsum',\n",
       "  'filetype': 'txt',\n",
       "  'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
       "  'track_id': 123},\n",
       " {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
       "  'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
       "  'description': 'lorem ipsum',\n",
       "  'filetype': 'pdf',\n",
       "  'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
       "  'track_id': 123},\n",
       " {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
       "  'pages': [2, 3, 5, 6],\n",
       "  'description': 'lorem ipsum',\n",
       "  'filetype': 'docx',\n",
       "  'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
       "  'track_id': 123},\n",
       " {'filename': 'audio-2.audio',\n",
       "  'timestamps': [(0, 3), (3, 6)],\n",
       "  'description': 'lorem ipsum',\n",
       "  'filetype': 'audio',\n",
       "  'thumbnail': 'audio-2.jpg',\n",
       "  'track_id': 123}]"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "output"
+    "get_data_description('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getting data thumbnais. \n",
    "def get_data_thumbnail(data_path, timestamp = None):\n",
    "    # ensuring no // or / or extension is present\n",
    "    file_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
    "    # first check is to see if the file_name has a .png image in the thumbnail folder\n",
    "    if os.path.exists(f'data/thumbnails/{file_name}.png'):\n",
    "        return f'data/thumbnails/{file_name}.png'\n",
    "    # the second check is to see if we have a folder with this file_name\n",
    "    elif os.path.exists(f'data/{file_name}'):\n",
    "        # so now we want to access the first timestamp\n",
    "        if timestamp:\n",
    "            first = timestamp[0]\n",
    "            # split by -\n",
    "            start, end = first.split('-')\n",
    "            # we want to convert something like 03:00, 04:00, 03:30 which is in min:sec to seconds\n",
    "            start = int(start.split(':')[0])*60 + int(start.split(':')[1])\n",
    "            end = int(end.split(':')[0])*60 + int(end.split(':')[1])\n",
    "            # bringing them together\n",
    "            image_file = f\"{start}-{end}s.png\"\n",
    "            # niw checkin if the file exists\n",
    "            if os.path.exists(f'data/{file_name}/{image_file}'):\n",
    "                return f'data/{file_name}/{image_file}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_data_thumbnail('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', timestamp=['3:30-4:0', '5:30-6:0'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'data/thumbnails/corolla-2020-toyota-owners-manual.png'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_data_thumbnail(\"./data\\\\corolla-2020-toyota-owners-manual.pdf'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_doc_search(data):\n",
    "    summary = {}\n",
    "\n",
    "    for item in data:\n",
    "        source = item['source']\n",
    "        if source not in summary:\n",
    "            summary[source] = {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')}\n",
    "        \n",
    "        if 'page' in item:\n",
    "            summary[source]['pages'].append(item['page'])\n",
    "        if 'timestamp' in item:\n",
    "            summary[source]['timestamps'].append(item['timestamp'])\n",
    "    \n",
    "    # Formatting the summary as a list of dictionaries\n",
    "    summarized_list = [\n",
    "        {'filename': key.split(\"\\\\\")[-1], \n",
    "         'pages': value['pages'], \n",
    "         'timestamps': value['timestamps'], \n",
    "         'file_type': value['file_type']}\n",
    "        for key, value in summary.items()\n",
    "    ]\n",
    "    \n",
    "    # getting the file description and thumbnail\n",
    "    for item in summarized_list:\n",
    "        item['description'] = get_data_description(item['filename'])\n",
    "        # ehcking if we have an empty timestamp list\n",
    "        if len(item['timestamps']) > 0:\n",
    "            item['thumbnail'] = get_data_thumbnail(item['filename'], item['timestamps'])\n",
    "        else:\n",
    "            item['thumbnail'] = get_data_thumbnail(item['filename'])\n",
    "    \n",
    "    return summarized_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_summary = summarize_doc_search(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
       "  'pages': [424, 329, 264, 290, 201, 326, 315, 317, 325, 422],\n",
       "  'timestamps': [],\n",
       "  'file_type': 'pdf',\n",
       "  'description': \"This document is about the user manual for a Toyota Corolla, providing information and instructions on various aspects of the vehicle, including safety and security, vehicle status, driving operations, interior features, maintenance, and troubleshooting. The manual covers topics such as child seat installation, theft deterrent systems, reading driving-related information, operating the Entune audio system, and caring for the vehicle's interior and exterior. It also includes information on reporting safety defects and provides instructions for Canadian owners on seat belt and SRS air\",\n",
       "  'thumbnail': 'data/thumbnails/corolla-2020-toyota-owners-manual.png'},\n",
       " {'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'pages': [],\n",
       "  'timestamps': ['0:30-1:0',\n",
       "   '2:00-2:00',\n",
       "   '2:30-3:0',\n",
       "   '4:00-4:00',\n",
       "   '5:30-6:0',\n",
       "   '8:00-8:00',\n",
       "   '8:30-9:0',\n",
       "   '10:00-10:00'],\n",
       "  'file_type': 'video',\n",
       "  'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\",\n",
       "  'thumbnail': 'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/30-60s.png'},\n",
       " {'filename': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
       "  'pages': [],\n",
       "  'timestamps': ['3:30-4:0', '5:30-6:0'],\n",
       "  'file_type': 'video',\n",
       "  'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of Auto-Doc's video tutorials.\",\n",
       "  'thumbnail': 'data/How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'}]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -0,0 +1,145 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "from PIL import Image, ImageDraw, ImageFont\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_text_thumbnail(file_path):\n",
    "    # Create a folder for thumbnails if it doesn't exist\n",
    "    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')\n",
    "    os.makedirs(thumbnail_folder, exist_ok=True)\n",
    "    \n",
    "    # Extract file name (without extension)\n",
    "    file_name = os.path.splitext(os.path.basename(file_path))[0]\n",
    "    \n",
    "    # Create a random background color\n",
    "    background_color = tuple(random.randint(0, 255) for _ in range(3))\n",
    "    \n",
    "    # Create an image with the random background color\n",
    "    img = Image.new('RGB', (800, 400), color=background_color)\n",
    "    \n",
    "    # Initialize drawing context\n",
    "    d = ImageDraw.Draw(img)\n",
    "    \n",
    "    # Load a font\n",
    "    try:\n",
    "        font = ImageFont.truetype(\"arial.ttf\", 25)  # Adjust the font size as needed\n",
    "    except IOError:\n",
    "        font = ImageFont.load_default()\n",
    "    \n",
    "    # Get the bounding box of the text\n",
    "    text_bbox = d.textbbox((0, 0), file_name, font=font)\n",
    "    text_width = text_bbox[2] - text_bbox[0]\n",
    "    text_height = text_bbox[3] - text_bbox[1]\n",
    "    \n",
    "    # Calculate the position to center the text\n",
    "    text_x = (img.width - text_width) / 2\n",
    "    text_y = (img.height - text_height) / 2\n",
    "    \n",
    "    # Draw the text onto the image\n",
    "    d.text((text_x, text_y), file_name, font=font, fill=(255, 255, 255))  # White text\n",
    "    \n",
    "    # Save the image\n",
    "    thumbnail_path = os.path.join(thumbnail_folder, f\"{file_name}.png\")\n",
    "    img.save(thumbnail_path)\n",
    "    \n",
    "    print(f\"Thumbnail created: {thumbnail_path}\")\n",
    "\n",
    "def process_directory(directory_path):\n",
    "    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']\n",
    "    \n",
    "    for file in os.listdir(directory_path):\n",
    "        file_path = os.path.join(directory_path, file)\n",
    "        if os.path.isfile(file_path):\n",
    "            file_extension = os.path.splitext(file)[1].lower()\n",
    "            if file_extension in supported_extensions:\n",
    "                create_text_thumbnail(file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Thumbnail created: data\\thumbnails\\audio-2.png\n",
      "Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-repair.png\n",
      "Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-service.png\n",
      "Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tire.png\n",
      "Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tuning.png\n",
      "Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-wash.png\n",
      "Thumbnail created: data\\thumbnails\\corolla-2020-toyota-owners-manual.png\n",
      "Thumbnail created: data\\thumbnails\\How to change engine oil and filter on TOYOTA Corolla.png\n",
      "Thumbnail created: data\\thumbnails\\How to change front brake pads on TOYOTA Corolla.png\n",
      "Thumbnail created: data\\thumbnails\\How to change rear windshield wipers on TOYOTA Corolla.png\n",
      "Thumbnail created: data\\thumbnails\\How to change spark plugs on TOYOTA COROLLA.png\n",
      "Thumbnail created: data\\thumbnails\\test_rec.png\n"
     ]
    }
   ],
   "source": [
    "# Example usage:\n",
    "directory_path = 'data'\n",
    "process_directory(directory_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smog_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
 from uuid import uuid4
 from langchain_core.documents  import Document
 from text_extractor import TextExtractor
-import os
+import os, sys
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from loggings.logging_config import logger
 import random
 from PIL import Image, ImageDraw, ImageFont
 from concurrent.futures import ThreadPoolExecutor
 import math
 import json
@@ -29,6 +33,7 @@ import ffmpeg
 from dotenv import load_dotenv
 load_dotenv()
 # OpenAI API Key
 api_key = os.getenv('OPENAI_API_KEY')
 # setting up groq api key
@@ -53,11 +58,14 @@ def load_embedding_model():
 # ----------------------------------------------------------------------------------------------------
 # loading the embedding model
 logger.info("Loading the embedding model")
 embeddings = load_embedding_model()
 logger.info("Embedding model loaded")
 # --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
 def create_documents(doc, file_type='text'):
    logger.info(f"Creating documents from text")
    text = doc[0].page_content
    metadata = doc[0].metadata
    text_splitter = RecursiveCharacterTextSplitter(
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):
 def load_txt_document(document_path):
    logger.info(f"Loading text document from {document_path}")
    try:
        txt_doc = TextLoader(document_path)
        text = txt_doc.load()
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
 def load_docx_document(document_path):
    logger.info(f"Loading docx document from {document_path}")
    try:
        docx_doc = Docx2txtLoader(document_path)
        text = docx_doc.load()
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
 # creating a function that checks the document type and loads the document
 def load_pdf_document(document_path):
    logger.info(f"Loading pdf document from {document_path}")
    try:
        pdf_doc = PyPDFLoader(document_path)
        pages = pdf_doc.load_and_split()
@@ -125,11 +136,13 @@ def load_document(document_path):
 # ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
 # Function to encode the image
 def encode_image(image_path):
-  with open(image_path, "rb") as image_file:
+    logger.info(f"Encoding image {image_path}")
-    return base64.b64encode(image_file.read()).decode('utf-8')
+    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
 # Vision API to process the image
 def process_image(image_path):
    logger.info(f"Processing image {image_path}")
    global api_key
    # Getting the base64 string
@@ -174,10 +187,11 @@ def process_image(image_path):
 # create image document
 def create_image_document(image_path, file_type='image'):
    logger.info(f"Creating image document from {image_path}")
    # getting the image name from the image path
-    image_name = image_path.split('/')[-1].split('.')[0]
+    image_name = image_path.split('\\')[-1].split('.')[0]
    # setting image name as metadata
-    metadata = {'filename': image_name, 'file_type': file_type}
+    metadata = {'source': image_name, 'file_type': file_type}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # removing special characters and line breaks
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
 # -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
 # Audio to Text
 def audio_to_text(filepath):
    logger.info(f"Transcribing audio file {filepath}")
    with open(filepath, "rb") as file:
        translation = client.audio.translations.create(
            file=(filepath, file.read()),
@@ -208,6 +223,7 @@ def audio_to_text(filepath):
 def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
    logger.info(f"Splitting audio file {audio_file_path} by duration")
    # Convert chunk duration to milliseconds
    chunk_length_ms = chunk_duration_minutes * 60 * 1000
@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
    return chunk_folder, chunk_paths
 def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
    logger.info(f"Transcribing audio chunks from {audio_file_path}")
    # Split the audio file into chunks
    chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
@@ -270,11 +287,25 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
        start_min = (chunk_index - 1) * chunk_duration_minutes
        end_min = chunk_index * chunk_duration_minutes
        actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000))  # To handle the last chunk's actual duration
        # preparing the start and end minutes in a timestamp format, also handling fractional (decimal) minute values by converting the decimal part to seconds
        if start_min % 1 == 0:
            start_min = f"{int(start_min)}:00"
            end_min = f"{int(end_min)}:00"
        else:
            # splitting the decimal part of the start and end min
            start_min_int, start_min_dec = str(start_min).split('.')
            end_min_int, end_min_dec = str(end_min).split('.')
            # converting the decimal part to seconds
            start_sec = int(start_min_dec) * 6
            end_sec = int(end_min_dec) * 6
            start_min = f"{start_min_int}:{start_sec}"
            end_min = f"{end_min_int}:{end_sec}"
        # Create a document with the transcript and metadata
        metadata = {
-                "filename": base_filename,
+                "source": base_filename,
-                "duration": f"{start_min}-{end_min} minutes", 
+                "timestamp": f"{start_min}-{end_min}", 
                "file_type": file_type,
            }
        document = Document(page_content=transcript, metadata=metadata)
@@ -282,6 +313,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
    # Delete the chunk folder after processing
    shutil.rmtree(chunk_folder)
    # adding a delay
    time.sleep(0.2)
    return documents
@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='
 # ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
 def preprocess_video_data(video_path: str, time_interval: int):
-    
+    logger.info(f"Preprocessing video data from {video_path}")
    # Load the video file
    video = VideoFileClip(video_path)
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
    # now creating document from the audio file
    documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
    logger.info(f"Documents created from video {video_path}")
    # deleting the audio file
    os.remove(audio_path)
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
 #----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
 def doc_summarizer(document_page: list) -> str:
    logger.info(f"Summarizing document")
    initiator_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        Create a short summary of the document based on the provided text. 
@@ -370,12 +406,15 @@ def doc_summarizer(document_page: list) -> str:
 #-----------------------------------------------------OTHERS--------------------------------------------------------------
 def save_embedded_data(embeddings, key="data"):
-  embeddings.save_local(f"index/faiss_index_{key}")
+    logger.info(f"Saving embeddings")
-  print("Embeddings saved")
+    embeddings.save_local(f"index/faiss_index_{key}")
    print("Embeddings saved")
    return 'saved'
 def load_embedded_data(embeddings=embeddings, key="data"):
-  embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
+    logger.info(f"Loading embedded data")
-  return embed_db
+    embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
    return embed_db
 #-----------------------------------------------------Data Loading Process----------------------------------------------------
@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
    elif extension in image_doc:
        doc = process_map["image"](path)
        num_pages = 1
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source'].split('\\')[-1]
    elif extension in audio_doc:
        doc = process_map["audio"](path)
        num_pages = len(doc)
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source']
    elif extension in video_doc:
        doc = process_map["video"](path, time_interval=30)
        num_pages = len(doc)
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source']
    else:
        return None, None, None  # Unhandled extension
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
    def process_with_delay(file):
        result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
-        time.sleep(0.1)  # Introduce a 0.1s delay between processing each document
+        time.sleep(0.4)  # Introduce a 0.4s delay between processing each document
        return result
    with ThreadPoolExecutor() as executor:
@@ -441,27 +480,31 @@ def load_documents_from_directory(directory_path: str):
            first_page = doc[0].page_content
            summary = doc_summarizer(first_page)
            doc_summary.append(summary)
            # adding some delay
            time.sleep(0.5)
    docs_id = [uuid4().hex for _ in range(len(documents))]
    json_file = os.path.join(directory_path, 'data.json')
-    data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}
+    # creating a dictionary for each document in the json file
-    
+    for i in range(len(documents)):
-    if os.path.exists(json_file):
+        data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
-        with open(json_file, 'r+') as f:
+        if os.path.exists(json_file):
-            existing_data = json.load(f)
+            with open(json_file, 'r+') as f:
-            existing_data.update(data)
+                existing_data = json.load(f)
-            f.seek(0)
+                existing_data.update(data)
-            json.dump(existing_data, f)
+                f.seek(0)
-    else:
+                json.dump(existing_data, f)
-        with open(json_file, 'w') as f:
+        else:
-            json.dump(data, f)
+            with open(json_file, 'w') as f:
                json.dump(data, f)
    return documents, docs_id, num_pages
 # A function to create vector store
 def create_vector_store(documents: list, docs_id: list, num_pages: list):
    logger.info(f"Creating vector store")
    # index set up with the embedding dimension
    index = faiss.IndexFlatL2(384)
    # Initialize the FAISS vector store
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
        doc_id = docs_id[i]
        page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
        vector_store.add_documents(documents=documents[i], ids=page_ids)
-        
+    logger.info(f"Vector store created")
    logger.info(f"Saving the vector store")
    # saving the vector store automatically
    save_embedded_data(vector_store, key="data")
-    
+    logger.info(f"Vector store saved")
    return vector_store
 # creating a function to add documents to the vector store
@@ -491,14 +535,70 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
        page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
        vector_store.add_documents(documents=documents[i], ids=page_ids)
    print ("Documents added to the vector store")
 #----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
 def create_text_thumbnail(file_path):
    """Render a placeholder PNG thumbnail showing the file's base name.

    The image is an 800x400 canvas with a random background colour and the
    file name (without extension) centred on it. It is written to a
    ``thumbnails`` folder created next to the file, i.e.
    ``<dirname(file_path)>/thumbnails/<name>.png``. The file itself is never
    opened; only its path is used.

    Args:
        file_path: Path of the file to create a thumbnail for.

    Returns:
        The path of the PNG file that was written.
    """
    logger.info(f"Creating thumbnail for {file_path}")
    # Create a folder for thumbnails if it doesn't exist
    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')
    os.makedirs(thumbnail_folder, exist_ok=True)
    # Extract file name (without extension)
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    # Create a random background color
    background_color = tuple(random.randint(0, 255) for _ in range(3))
    # Create an image with the random background color
    img = Image.new('RGB', (800, 400), color=background_color)
    # Initialize drawing context
    d = ImageDraw.Draw(img)
    # Load a font, falling back to Pillow's built-in one when Arial is missing
    try:
        font = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
    except IOError:
        font = ImageFont.load_default()
    # Get the bounding box of the text as it would be drawn at the origin
    left, top, right, bottom = d.textbbox((0, 0), file_name, font=font)
    text_width = right - left
    text_height = bottom - top
    # Centre the text. The bounding box does not necessarily start at the
    # drawing origin, so its left/top offset is subtracted; otherwise the
    # text ends up shifted right/down by that offset.
    text_x = (img.width - text_width) / 2 - left
    text_y = (img.height - text_height) / 2 - top
    # Keep the text readable: white on dark backgrounds, black on light ones.
    # Perceived brightness uses the usual 0.299/0.587/0.114 RGB weighting;
    # 186 is a heuristic cut-off, so most backgrounds still get white text.
    red, green, blue = background_color
    brightness = 0.299 * red + 0.587 * green + 0.114 * blue
    text_color = (0, 0, 0) if brightness > 186 else (255, 255, 255)
    # Draw the text onto the image
    d.text((text_x, text_y), file_name, font=font, fill=text_color)
    # Save the image
    thumbnail_path = os.path.join(thumbnail_folder, f"{file_name}.png")
    img.save(thumbnail_path)
    print(f"Thumbnail created: {thumbnail_path}")
    return thumbnail_path
 def process_directory(directory_path):
    """Create a text thumbnail for every supported file in a directory.

    Only the direct children of ``directory_path`` are examined; anything
    that is not a regular file, or whose extension (compared
    case-insensitively) is not in the supported list, is skipped.

    Args:
        directory_path: Directory whose files should get thumbnails.

    Returns:
        The string ``"Done"`` once every entry has been visited.
    """
    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']
    for entry in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry)
        # Skip sub-directories and other non-file entries
        if not os.path.isfile(candidate):
            continue
        _, suffix = os.path.splitext(entry)
        if suffix.lower() not in supported_extensions:
            continue
        create_text_thumbnail(candidate)
    return "Done"
 #-----------------------------------------------------------SEARCH-------------------------------------------------------
 # A document search function
 def search(query, k=20):
    logger.info(f"Searching for {query}")
    # loading the embedded data
    embed_db = load_embedded_data()
    db = embed_db
    docs = db.similarity_search(query, k)
    logger.info(f"Search completed")
    all = []
    info = []
    for doc in docs: