diff --git a/Projct Structure.txt b/Projct Structure.txt new file mode 100644 index 00000000..8a3af613 --- /dev/null +++ b/Projct Structure.txt @@ -0,0 +1,16 @@ +---- 1. Load User Document + ----> Starting with text documents, such as PDF, txt and docx files. + ----> Data Ingestion is meant to take in the user data. Load the embedding model, then create a vector database from it. + ----> Considerations: + 1. PDFs have pages already, hence the text splitter won't be used. We want to be able to reference the pages on which the retrieved text can be found. + 2. The approach for other data types can be different. We can use a text splitter for txt files and, if possible, add page numbers to the chunks for easy reference. + 3. + + Data Ingestion Module: + This module will handle the data ingestion process. + utils.py --> keeps the reusable functions + pdf_ingest.py --> This module will handle PDFs + + + Loggings Module: + This module will keep logs of what's going on here. \ No newline at end of file diff --git a/data_ingestion/__init__.py b/data_ingestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data_ingestion/__pycache__/__init__.cpython-311.pyc b/data_ingestion/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 00000000..09e0fb0e Binary files /dev/null and b/data_ingestion/__pycache__/__init__.cpython-311.pyc differ diff --git a/data_ingestion/__pycache__/utils.cpython-311.pyc b/data_ingestion/__pycache__/utils.cpython-311.pyc new file mode 100644 index 00000000..d392cb33 Binary files /dev/null and b/data_ingestion/__pycache__/utils.cpython-311.pyc differ diff --git a/data_ingestion/pdf_ingest.py b/data_ingestion/pdf_ingest.py new file mode 100644 index 00000000..951d5d0a --- /dev/null +++ b/data_ingestion/pdf_ingest.py @@ -0,0 +1,42 @@ +from langchain_community.document_loaders import PyPDFLoader +from utils import create_vector_store, save_embedded_data +import sys, os + +# Add the root directory to sys.path 
def load_pdf_document(document_path: str):
    """Load a PDF file and return its contents as LangChain documents.

    The input is validated by file extension only; the file's bytes are not
    inspected here. The extension check is case-insensitive so that paths
    such as ``manual.PDF`` are accepted.

    Args:
        document_path: Path to the PDF file. ``os.PathLike`` objects are
            accepted as well and are converted with ``os.fspath``.

    Returns:
        The list returned by ``PyPDFLoader.load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end with ``.pdf``.
    """
    logger.info("Loading document from %s", document_path)
    logger.info("Checking if the document is a pdf")

    # Normalise to text once so both the extension check and the loader see
    # the same value, whether the caller passed a str or a path object.
    path_text = os.fspath(document_path)

    # Guard clause: reject anything that is not a PDF before touching the
    # loader. Lower-casing makes ".PDF" / ".Pdf" valid extensions too.
    if not path_text.lower().endswith(".pdf"):
        logger.error("Unsupported document type for %s", document_path)
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    # NOTE(review): load_and_split() presumably runs LangChain's default text
    # splitter on top of the per-page load, so the count logged below may be
    # chunks rather than physical PDF pages -- confirm against the loader docs.
    pages = PyPDFLoader(path_text).load_and_split()
    logger.info("Document loaded and split into %d pages", len(pages))
    return pages
def search(db, query, k=4):
    """Run a similarity search and summarise the matching documents.

    Args:
        db: Vector store exposing ``similarity_search(query, k)`` and
            returning objects with ``page_content`` and ``metadata``.
        query: Search text passed through to the vector store.
        k: Maximum number of documents requested from the store.

    Returns:
        A 3-tuple ``(best, combined, pages)``:
            * ``best`` -- ``page_content`` of the first hit,
            * ``combined`` -- every hit's ``page_content``, each followed by
              a newline, in result order,
            * ``pages`` -- the ``"page"`` metadata value of each hit, or
              ``None`` for hits that carry no page metadata.
        When the store returns no hits the result is ``("", "", [])``.
    """
    docs = db.similarity_search(query, k)

    # An empty result set used to crash on docs[0]; return an empty summary
    # with the same tuple shape so callers can unpack it unconditionally.
    if not docs:
        return "", "", []

    # One join instead of repeated string concatenation, and no local name
    # shadowing the builtin ``all``.
    combined = "".join(f"{doc.page_content}\n" for doc in docs)

    # Not every document source provides page numbers, so a missing "page"
    # key yields None instead of raising KeyError.
    pages = [doc.metadata.get("page") for doc in docs]

    return docs[0].page_content, combined, pages
Loading the embeddings model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -32,27 +47,14 @@ "encode_kwargs = {\"normalize_embeddings\": True}\n", "embeddings = HuggingFaceBgeEmbeddings(\n", " model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n", - " )\n", - "\n" + " )" ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from langchain_community.document_loaders import PyPDFLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Load the document \n", - "loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")" + "## Experiment for pdf loading" ] }, { @@ -61,85 +63,54 @@ "metadata": {}, "outputs": [], "source": [ - "pages = loader.load_and_split()" + "# creating a function that checks the document type and loads the document\n", + "def load_pdf_document(document_path):\n", + " if document_path.endswith(\".pdf\"):\n", + " pdf_doc = PyPDFLoader(document_path)\n", + " pages = pdf_doc.load_and_split()\n", + " return pages\n", + " else:\n", + " raise ValueError(f\"Unsupported document type for {document_path}\")\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "588" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(pages)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and 
\\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pages[0]" + "# Load the document \n", + "document_path = \"data/corolla-2020-toyota-owners-manual.pdf\"\n", + "pdf_pages = load_pdf_document(document_path)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. 
(Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n", - " attn_output = torch.nn.functional.scaled_dot_product_attention(\n" - ] - } - ], + "outputs": [], "source": [ - "db = FAISS.from_documents(pages, embeddings)" + "db = FAISS.from_documents(pdf_pages, embeddings)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "def save_embedded_data(embeddings, key=\"rand\"):\n", + "def save_embedded_data(embeddings, key=\"pdf\"):\n", " embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n", " print(\"Embeddings saved\")\n", "\n", - "def load_embedded_data(embeddings, key):\n", + "def load_embedded_data(embeddings, key=\"pdf\"):\n", " embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n", " return embed_db" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -151,16 +122,23 @@ } ], "source": [ - "save_embedded_data(db, key=\"rand\")" + "save_embedded_data(db)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "load_db = load_embedded_data(embeddings, key=\"rand\")" + "load_db = load_embedded_data(embeddings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Search" ] }, { @@ -270,44 +248,122 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def search(db, query, k=4):\n", " docs = db.similarity_search(query, k)\n", " all = \"\"\n", + " pages = []\n", " for doc in docs:\n", " all += f\"{doc.page_content}\\n\"\n", - " return docs[0].page_content, all" + " pages.append(doc.metadata['page'])\n", + " return docs[0].page_content, all, pages" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, 
"metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "'FAISS' object is not callable", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n", - "Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n", - "File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m 
\u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n", - "File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, 
**kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n", - "File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable" + "name": "stdout", + "output_type": "stream", + "text": [ + "206 4-5. Using the driving support systems\n", + "COROLLA_UWARNING\n", + "■Before using LDA system\n", + "●Do not rely solely upon the LDA \n", + "system. The LDA system does \n", + "not automatically drive the vehi-cle or reduce the amount of \n", + "attention that must be paid to \n", + "the area in front of the vehicle. The driver must always assume \n", + "full responsibilit y for driving \n", + "safely by paying careful atten-\n", + "tion to the surrounding condi-tions and operating the steering \n", + "wheel to correct the path of the \n", + "vehicle. Also, the driver must take adequate breaks when \n", + "fatigued, such as from driving \n", + "for a long period of time.\n", + "●Failure to perform appropriate \n", + "driving operations and pay care-\n", + "ful attention may lead to an \n", + "accident, resulting in death or serious injury.\n", + "●When not using the LDA sys-\n", + "tem, use the LDA switch to turn \n", + "the system off.\n", + "■Situations unsuitable for LDA system\n", + "In the following situations, use the LDA switch to turn the system off. 
\n", + "Failure to do so may lead to an \n", + "accident, resulting in death or serious injury.\n", + "●Vehicle is driven on a road sur-\n", + "face which is slippery due to \n", + "rainy weather, fallen snow, freezing, etc.\n", + "●Vehicle is driven on a snow-cov-\n", + "ered road.\n", + "●White (yellow) lin es are difficult \n", + "to see due to rain, snow, fog, \n", + "dust, etc.\n", + "●A spare tire, tire chains, etc. are \n", + "equipped.●When the tires have been excessively worn, or when the \n", + "tire inflation p ressure is low.\n", + "●When tires of a size other than specified are installed.\n", + "●Vehicle is driven in traffic lanes \n", + "other than that highways and \n", + "freeways.\n", + "●During emergency towing.\n", + "■Preventing LDA system mal-functions and operations per-\n", + "formed by mistake\n", + "●Do not modify the headlights or place stickers, etc. on the sur-\n", + "face of the lights.\n", + "●Do not modify the suspension etc. If the suspension etc. needs \n", + "to be replaced, contact your \n", + "Toyota dealer.\n", + "●Do not install or place anything on the hoo d or grille. Also, do \n", + "not install a gr ille guard (bull \n", + "bars, kangaroo bar, etc.).\n", + "●If your windshield needs repairs, contact your Toyota \n", + "dealer.\n", + "■Conditions in which functions \n", + "may not operate properly\n", + "In the following situations, the \n", + "functions may not operate prop-erly and the vehicle may depart \n", + "from its lane. 
Drive safely by \n", + "always paying careful attention to your surroundings and operate \n", + "the steering whee l to correct the \n", + "path of the vehicle without relying \n", + "solely on the functions.\n", + "●Vehicle is being driven around a sharp curve.\n", + "https://www.MyCarManual.com\n" ] } ], "source": [ - "search_result, all = search(db, \"What is LDA\")\n", + "search_result, all, pages = search(db, \"What is LDA\")\n", "print( search_result )" ] }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[205, 208, 204, 212]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pages" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/loggings/__init__.py b/loggings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/loggings/__pycache__/__init__.cpython-311.pyc b/loggings/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 00000000..916d547f Binary files /dev/null and b/loggings/__pycache__/__init__.cpython-311.pyc differ diff --git a/loggings/__pycache__/logging_config.cpython-311.pyc b/loggings/__pycache__/logging_config.cpython-311.pyc new file mode 100644 index 00000000..fb3bcabc Binary files /dev/null and b/loggings/__pycache__/logging_config.cpython-311.pyc differ diff --git a/loggings/app.log b/loggings/app.log new file mode 100644 index 00000000..762510e3 --- /dev/null +++ b/loggings/app.log @@ -0,0 +1,28 @@ +2024-08-05 22:09:11,365 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf +2024-08-05 22:09:11,365 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf +2024-08-05 22:09:11,365 - INFO - Checking if the document is a pdf +2024-08-05 22:09:11,365 - INFO - Document is a pdf +2024-08-05 22:09:11,365 - INFO - Loading and splitting the document +2024-08-05 22:09:56,949 - INFO - Document loaded and split 
into 588 pages +2024-08-05 22:09:56,949 - INFO - Creating vector store +2024-08-05 22:10:06,736 - INFO - Loading faiss with AVX2 support. +2024-08-05 22:10:06,774 - INFO - Successfully loaded faiss with AVX2 support. +2024-08-05 22:10:06,800 - INFO - Vector store created +2024-08-05 22:10:06,802 - INFO - Saving the vector store +2024-08-05 22:11:24,966 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf +2024-08-05 22:11:24,966 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf +2024-08-05 22:11:24,966 - INFO - Checking if the document is a pdf +2024-08-05 22:11:24,966 - INFO - Document is a pdf +2024-08-05 22:11:24,966 - INFO - Loading and splitting the document +2024-08-05 22:12:09,202 - INFO - Document loaded and split into 588 pages +2024-08-05 22:12:09,202 - INFO - Creating vector store +2024-08-05 22:12:19,066 - INFO - Loading faiss with AVX2 support. +2024-08-05 22:12:19,089 - INFO - Successfully loaded faiss with AVX2 support. +2024-08-05 22:12:19,123 - INFO - Vector store created +2024-08-05 22:12:19,123 - INFO - Saving the vector store +2024-08-05 22:12:19,131 - INFO - Vector store saved +2024-08-05 22:12:55,111 - INFO - Loading faiss with AVX2 support. +2024-08-05 22:12:55,144 - INFO - Successfully loaded faiss with AVX2 support. +2024-08-05 22:12:55,205 - INFO - Receiving the search query +2024-08-05 22:13:04,060 - INFO - Searching for what is LDA? 
import logging
import logging.handlers
import os

# Location of the shared application log. The path is relative, so it is
# resolved against the process's current working directory.
LOG_FILE = 'loggings/app.log'

# Make sure the log directory exists before the file handler opens the file.
# exist_ok=True avoids the check-then-create race of a separate exists() test,
# and deriving the directory from LOG_FILE keeps the two from drifting apart.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Configure the root logger once at import time: INFO and above go both to
# the log file and to the console stream, with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)

# Logger object imported by the other modules of the project.
logger = logging.getLogger(__name__)
index 046f332a..3492cbc0 100644 Binary files a/vec-db/index/faiss_index_rand/index.pkl and b/vec-db/index/faiss_index_pdf/index.pkl differ