Pdf Ingestion pipeline completed

This commit is contained in:
timothyafolami
2024-08-05 22:14:19 +01:00
parent b0c3eb8032
commit c34de21971
15 changed files with 318 additions and 90 deletions
+16
View File
@@ -0,0 +1,16 @@
---- 1. Load User Document
----> Starting with text documents, such as PDF, TXT and DOCX files.
----> Data Ingestion is meant to take in the user data. Load the embedding model, then create a vector database from it.
----> Considerations:
1. PDFs already have pages, hence a text splitter won't be used. We want to be able to reference the pages on which the retrieved content can be found.
2. The approach for other data types can be different. We can use a text splitter for TXT files and, if possible, add page numbers to the resulting chunks for easy reference.
3.
Data Ingestion Module:
This module will handle the data ingestion process.
utils.py --> keeps the reusable functions
pdf_ingest.py --> This module will handle pdfs
Loggings Module:
This module will keep logs of what's going on here.
View File
Binary file not shown.
Binary file not shown.
+42
View File
@@ -0,0 +1,42 @@
from langchain_community.document_loaders import PyPDFLoader
from utils import create_vector_store, save_embedded_data
import sys, os
# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
def load_pdf_document(document_path: str):
    """Load a PDF file into a list of LangChain documents.

    Args:
        document_path: Path to the file to load. The ".pdf" extension is
            matched case-insensitively, so "manual.PDF" is accepted as well.

    Returns:
        The documents produced by ``PyPDFLoader.load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end with a ".pdf"
            extension.
    """
    logger.info(f"Loading document from {document_path}")
    logger.info("Checking if the document is a pdf")

    # Guard clause: reject anything that is not a PDF before touching the
    # loader. lower() makes the check independent of the extension's case.
    if not document_path.lower().endswith(".pdf"):
        logger.error(f"Unsupported document type for {document_path}")
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    # NOTE(review): load_and_split() appears to run LangChain's default text
    # splitter on top of the per-page load, so a long page may yield more
    # than one chunk -- confirm this matches the "no text splitter for PDFs"
    # design decision (PyPDFLoader.load() would return one document per page).
    pages = PyPDFLoader(document_path).load_and_split()
    logger.info(f"Document loaded and split into {len(pages)} pages")
    return pages
def load_and_create_vector_store(document_path: str):
    """Ingest a PDF end to end: load it, embed it and persist the index.

    Args:
        document_path: Path to the PDF document to ingest.

    Returns:
        The fixed status message "Vector store created and saved".

    Any exception raised by the loading, embedding or saving helpers is
    propagated unchanged.
    """
    logger.info(f"Loading and creating vector store for {document_path}")
    documents = load_pdf_document(document_path)

    logger.info("Creating vector store")
    vector_store = create_vector_store(documents)
    logger.info("Vector store created")

    # Persist the freshly built index so the search script can reload it.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    status = "Vector store created and saved"
    return status
if __name__ == "__main__":
    # Ingest the PDF named on the command line, e.g.
    #   python pdf_ingest.py path/to/file.pdf
    # and fall back to the Corolla owner's manual path when no argument is
    # given, which keeps the previous no-argument behaviour.
    default_path = "./data/corolla-2020-toyota-owners-manual.pdf"
    document_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    load_and_create_vector_store(document_path)
+42
View File
@@ -0,0 +1,42 @@
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
def load_embedding_model(model_name="BAAI/bge-small-en", device=None):
    """Build the BGE embedding model used for indexing and search.

    Args:
        model_name: Hugging Face model identifier to load.
        device: Device string handed to the model, for example "cuda" or
            "cpu". When None, "cuda" is chosen if PyTorch reports an
            available GPU and "cpu" otherwise, so the function no longer
            fails on machines without CUDA.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance created with
        ``normalize_embeddings`` enabled.
    """
    if device is None:
        try:
            # Imported lazily so this module does not need a top-level torch
            # import just to pick a device.
            import torch

            device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

    return HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )
# Module-level embedding model, used as the default `embeddings` argument of
# create_vector_store() and load_embedded_data() below.
# NOTE: this call runs at import time, so importing this module loads the model.
embeddings = load_embedding_model()
def create_vector_store(document, embeddings=embeddings):
    """Embed documents and index them in a FAISS vector store.

    Args:
        document: Documents to index; passed straight to
            ``FAISS.from_documents``.
        embeddings: Embedding model to use. Defaults to the module-level
            model.

    Returns:
        The FAISS vector store built from the documents.
    """
    return FAISS.from_documents(document, embeddings)
def save_embedded_data(docs, key="pdf"):
    """Persist a vector store under the local index directory.

    Args:
        docs: Object exposing ``save_local(path)``; callers in this project
            pass the FAISS store returned by ``create_vector_store``.
        key: Suffix of the index folder name, giving
            ``vec-db/index/faiss_index_<key>``.
    """
    index_path = f"vec-db/index/faiss_index_{key}"
    docs.save_local(index_path)
    print("Embeddings saved")
def load_embedded_data(embeddings=embeddings, key="pdf"):
    """Load a previously saved FAISS index from the local index directory.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``. Defaults
            to the module-level model.
        key: Suffix of the index folder name, giving
            ``vec-db/index/faiss_index_<key>``.

    Returns:
        The FAISS vector store loaded from disk.
    """
    index_path = f"vec-db/index/faiss_index_{key}"
    # NOTE(review): the flag below explicitly opts in to what the library
    # labels "dangerous" deserialization -- only point this at index folders
    # written by this project, never at files from an untrusted source.
    return FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
def search(db, query, k=4):
    """Run a similarity search and summarise the hits.

    Args:
        db: Vector store exposing ``similarity_search(query, k)``.
        query: Text to search for.
        k: Number of documents requested from the store.

    Returns:
        A tuple ``(top_content, combined_content, pages)`` where

        * ``top_content`` is the ``page_content`` of the first hit, or ""
          when the store returns no hits;
        * ``combined_content`` is the ``page_content`` of every hit, each
          followed by a newline;
        * ``pages`` lists ``metadata["page"]`` for every hit, with None for
          a hit that carries no page metadata.
    """
    docs = db.similarity_search(query, k)
    if not docs:
        # An empty result set used to raise IndexError on docs[0].
        return "", "", []

    # join() replaces the repeated string += and avoids shadowing the
    # builtin all() with a local variable.
    combined_content = "".join(f"{doc.page_content}\n" for doc in docs)
    # .get() tolerates documents without page numbers instead of raising
    # KeyError.
    pages = [doc.metadata.get("page") for doc in docs]
    return docs[0].page_content, combined_content, pages
+146 -90
View File
@@ -1,20 +1,35 @@
{ {
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n", "## Libs import"
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS\n",
"from langchain_community.document_loaders import PyPDFLoader"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading the embeddings model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
@@ -32,27 +47,14 @@
"encode_kwargs = {\"normalize_embeddings\": True}\n", "encode_kwargs = {\"normalize_embeddings\": True}\n",
"embeddings = HuggingFaceBgeEmbeddings(\n", "embeddings = HuggingFaceBgeEmbeddings(\n",
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n", " model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
" )\n", " )"
"\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"from langchain_community.document_loaders import PyPDFLoader" "## Experiment for pdf loading"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the document \n",
"loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")"
] ]
}, },
{ {
@@ -61,85 +63,54 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"pages = loader.load_and_split()" "# creating a function that checks the document type and loads the document\n",
"def load_pdf_document(document_path):\n",
" if document_path.endswith(\".pdf\"):\n",
" pdf_doc = PyPDFLoader(document_path)\n",
" pages = pdf_doc.load_and_split()\n",
" return pages\n",
" else:\n",
" raise ValueError(f\"Unsupported document type for {document_path}\")\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/plain": [
"588"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"len(pages)" "# Load the document \n",
] "document_path = \"data/corolla-2020-toyota-owners-manual.pdf\"\n",
}, "pdf_pages = load_pdf_document(document_path)"
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and \\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages[0]"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
]
}
],
"source": [ "source": [
"db = FAISS.from_documents(pages, embeddings)" "db = FAISS.from_documents(pdf_pages, embeddings)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def save_embedded_data(embeddings, key=\"rand\"):\n", "def save_embedded_data(embeddings, key=\"pdf\"):\n",
" embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n", " embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n",
" print(\"Embeddings saved\")\n", " print(\"Embeddings saved\")\n",
"\n", "\n",
"def load_embedded_data(embeddings, key):\n", "def load_embedded_data(embeddings, key=\"pdf\"):\n",
" embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n", " embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n",
" return embed_db" " return embed_db"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -151,16 +122,23 @@
} }
], ],
"source": [ "source": [
"save_embedded_data(db, key=\"rand\")" "save_embedded_data(db)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"load_db = load_embedded_data(embeddings, key=\"rand\")" "load_db = load_embedded_data(embeddings)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Search"
] ]
}, },
{ {
@@ -270,44 +248,122 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def search(db, query, k=4):\n", "def search(db, query, k=4):\n",
" docs = db.similarity_search(query, k)\n", " docs = db.similarity_search(query, k)\n",
" all = \"\"\n", " all = \"\"\n",
" pages = []\n",
" for doc in docs:\n", " for doc in docs:\n",
" all += f\"{doc.page_content}\\n\"\n", " all += f\"{doc.page_content}\\n\"\n",
" return docs[0].page_content, all" " pages.append(doc.metadata['page'])\n",
" return docs[0].page_content, all, pages"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "TypeError", "name": "stdout",
"evalue": "'FAISS' object is not callable", "output_type": "stream",
"output_type": "error", "text": [
"traceback": [ "206 4-5. Using the driving support systems\n",
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "COROLLA_UWARNING\n",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "■Before using LDA system\n",
"Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n", "●Do not rely solely upon the LDA \n",
"Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n", "system. The LDA system does \n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n", "not automatically drive the vehi-cle or reduce the amount of \n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n", "attention that must be paid to \n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", "the area in front of the vehicle. The driver must always assume \n",
"\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable" "full responsibilit y for driving \n",
"safely by paying careful atten-\n",
"tion to the surrounding condi-tions and operating the steering \n",
"wheel to correct the path of the \n",
"vehicle. Also, the driver must take adequate breaks when \n",
"fatigued, such as from driving \n",
"for a long period of time.\n",
"●Failure to perform appropriate \n",
"driving operations and pay care-\n",
"ful attention may lead to an \n",
"accident, resulting in death or serious injury.\n",
"●When not using the LDA sys-\n",
"tem, use the LDA switch to turn \n",
"the system off.\n",
"■Situations unsuitable for LDA system\n",
"In the following situations, use the LDA switch to turn the system off. \n",
"Failure to do so may lead to an \n",
"accident, resulting in death or serious injury.\n",
"●Vehicle is driven on a road sur-\n",
"face which is slippery due to \n",
"rainy weather, fallen snow, freezing, etc.\n",
"●Vehicle is driven on a snow-cov-\n",
"ered road.\n",
"●White (yellow) lin es are difficult \n",
"to see due to rain, snow, fog, \n",
"dust, etc.\n",
"●A spare tire, tire chains, etc. are \n",
"equipped.●When the tires have been excessively worn, or when the \n",
"tire inflation p ressure is low.\n",
"●When tires of a size other than specified are installed.\n",
"●Vehicle is driven in traffic lanes \n",
"other than that highways and \n",
"freeways.\n",
"●During emergency towing.\n",
"■Preventing LDA system mal-functions and operations per-\n",
"formed by mistake\n",
"●Do not modify the headlights or place stickers, etc. on the sur-\n",
"face of the lights.\n",
"●Do not modify the suspension etc. If the suspension etc. needs \n",
"to be replaced, contact your \n",
"Toyota dealer.\n",
"●Do not install or place anything on the hoo d or grille. Also, do \n",
"not install a gr ille guard (bull \n",
"bars, kangaroo bar, etc.).\n",
"●If your windshield needs repairs, contact your Toyota \n",
"dealer.\n",
"■Conditions in which functions \n",
"may not operate properly\n",
"In the following situations, the \n",
"functions may not operate prop-erly and the vehicle may depart \n",
"from its lane. Drive safely by \n",
"always paying careful attention to your surroundings and operate \n",
"the steering whee l to correct the \n",
"path of the vehicle without relying \n",
"solely on the functions.\n",
"●Vehicle is being driven around a sharp curve.\n",
"https://www.MyCarManual.com\n"
] ]
} }
], ],
"source": [ "source": [
"search_result, all = search(db, \"What is LDA\")\n", "search_result, all, pages = search(db, \"What is LDA\")\n",
"print( search_result )" "print( search_result )"
] ]
}, },
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[205, 208, 204, 212]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
View File
Binary file not shown.
Binary file not shown.
+28
View File
@@ -0,0 +1,28 @@
2024-08-05 22:09:11,365 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
2024-08-05 22:09:11,365 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
2024-08-05 22:09:11,365 - INFO - Checking if the document is a pdf
2024-08-05 22:09:11,365 - INFO - Document is a pdf
2024-08-05 22:09:11,365 - INFO - Loading and splitting the document
2024-08-05 22:09:56,949 - INFO - Document loaded and split into 588 pages
2024-08-05 22:09:56,949 - INFO - Creating vector store
2024-08-05 22:10:06,736 - INFO - Loading faiss with AVX2 support.
2024-08-05 22:10:06,774 - INFO - Successfully loaded faiss with AVX2 support.
2024-08-05 22:10:06,800 - INFO - Vector store created
2024-08-05 22:10:06,802 - INFO - Saving the vector store
2024-08-05 22:11:24,966 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
2024-08-05 22:11:24,966 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
2024-08-05 22:11:24,966 - INFO - Checking if the document is a pdf
2024-08-05 22:11:24,966 - INFO - Document is a pdf
2024-08-05 22:11:24,966 - INFO - Loading and splitting the document
2024-08-05 22:12:09,202 - INFO - Document loaded and split into 588 pages
2024-08-05 22:12:09,202 - INFO - Creating vector store
2024-08-05 22:12:19,066 - INFO - Loading faiss with AVX2 support.
2024-08-05 22:12:19,089 - INFO - Successfully loaded faiss with AVX2 support.
2024-08-05 22:12:19,123 - INFO - Vector store created
2024-08-05 22:12:19,123 - INFO - Saving the vector store
2024-08-05 22:12:19,131 - INFO - Vector store saved
2024-08-05 22:12:55,111 - INFO - Loading faiss with AVX2 support.
2024-08-05 22:12:55,144 - INFO - Successfully loaded faiss with AVX2 support.
2024-08-05 22:12:55,205 - INFO - Receiving the search query
2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
2024-08-05 22:13:04,241 - INFO - Search completed
+19
View File
@@ -0,0 +1,19 @@
import logging
import logging.handlers
import os
# Define the logging configuration
LOG_FILE = 'loggings/app.log'

# Create the log directory if it doesn't exist. exist_ok=True replaces the
# separate exists() check, so a directory created in between (for example by
# a second process starting at the same time) no longer raises.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Send every record of level INFO and above to both the log file and the
# console, using one shared format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)

# Logger object imported by the other modules of the project.
logger = logging.getLogger(__name__)
+25
View File
@@ -0,0 +1,25 @@
from data_ingestion.utils import search, load_embedded_data
import sys, os
# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
# Load the saved FAISS index using load_embedded_data()'s default arguments.
# NOTE: this runs at import time, not only when the file is run as a script.
embed_db = load_embedded_data()
def main():
    """Prompt for a query, search the loaded index and print the results."""
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Searching for {query}")
    # Named combined_content rather than `all`, so the builtin all() is not
    # shadowed.
    page_content, combined_content, pages = search(embed_db, query)
    logger.info("Search completed")
    logger.info(f"Page content: {page_content}")
    print(f"Page content: {page_content}")
    print(f"Pages: {pages}")
    print(f"All: {combined_content}")
    print("Search completed")


if __name__ == "__main__":
    main()