Pdf Ingestion pipeline completed
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
---- 1. Load User Document
|
||||
----> Starting with text documents, such as PDF, TXT and DOCX files.
|
||||
----> Data Ingestion is meant to take in the user data. Load the embedding model, then create a vector database from it.
|
||||
----> Considerations:
|
||||
1. PDFs already have pages, hence a text splitter won't be used. We want to be able to reference the pages on which the retrieved content can be found.
|
||||
2. The approach for other data types can be different. We can use a text splitter for txt files and, if possible, add page numbers to the resulting chunks for easy reference.
|
||||
3.
|
||||
|
||||
Data Ingestion Module:
|
||||
This module will handle the data ingestion process.
|
||||
utils.py --> keeps the reusable functions
|
||||
pdf_ingest.py --> This module will handle pdfs
|
||||
|
||||
|
||||
Loggings Module:
|
||||
This module will keep logs of what's going on here.
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,42 @@
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from utils import create_vector_store, save_embedded_data
|
||||
import sys, os
|
||||
|
||||
# Add the root directory to sys.path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
|
||||
# A function to load the pdf document
|
||||
def load_pdf_document(document_path: str):
    """Load a PDF file and return it as a list of documents.

    Args:
        document_path: Path of the file to ingest. The extension check is
            case-insensitive, so both ``.pdf`` and ``.PDF`` are accepted.

    Returns:
        The list produced by ``PyPDFLoader(document_path).load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end with ``.pdf``.
    """
    logger.info(f"Loading document from {document_path}")
    logger.info("Checking if the document is a pdf")

    # Guard clause: reject anything that is not a PDF before touching the loader.
    # lower() makes the check accept upper-case extensions such as ".PDF".
    if not document_path.lower().endswith(".pdf"):
        logger.error(f"Unsupported document type for {document_path}")
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    # NOTE(review): load_and_split() may apply LangChain's default text
    # splitter on top of the per-page split -- confirm whether plain load()
    # is what is intended for PDFs.
    pages = PyPDFLoader(document_path).load_and_split()
    logger.info(f"Document loaded and split into {len(pages)} pages")
    return pages
|
||||
|
||||
# creating a function that loads the pdf document and creates the vector store
|
||||
def load_and_create_vector_store(document_path: str):
    """Ingest a PDF end to end: load it, embed it and persist the vector store.

    Args:
        document_path: Path of the PDF handed to ``load_pdf_document``.

    Returns:
        The status string ``"Vector store created and saved"``.
    """
    logger.info(f"Loading and creating vector store for {document_path}")
    documents = load_pdf_document(document_path)

    # Embed the loaded documents into a vector store.
    logger.info("Creating vector store")
    vector_store = create_vector_store(documents)
    logger.info("Vector store created")

    # Persist the store using the default key of save_embedded_data.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    status = "Vector store created and saved"
    return status
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: ingest the PDF named by the first command-line
    # argument; when no argument is given, fall back to the path that was
    # previously hard-coded here.
    default_document_path = "./data/corolla-2020-toyota-owners-manual.pdf"
    document_path = sys.argv[1] if len(sys.argv) > 1 else default_document_path
    load_and_create_vector_store(document_path)
|
||||
@@ -0,0 +1,42 @@
|
||||
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
||||
from langchain_community.vectorstores import FAISS
|
||||
|
||||
|
||||
# loading the embedding model
|
||||
def load_embedding_model(model_name="BAAI/bge-small-en", device=None):
    """Build the HuggingFace BGE embedding model.

    Args:
        model_name: Model identifier passed to ``HuggingFaceBgeEmbeddings``.
        device: Device string placed in ``model_kwargs`` (for example
            ``"cuda"`` or ``"cpu"``). When ``None``, ``"cuda"`` is used if
            PyTorch reports an available CUDA device, otherwise ``"cpu"``.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance configured with
        ``normalize_embeddings=True``.
    """
    if device is None:
        # Prefer the GPU, but do not fail on machines without CUDA (or
        # without an importable torch): fall back to the CPU instead.
        try:
            import torch

            device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

    return HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )
|
||||
|
||||
# loading the embedding model
|
||||
# Built once when this module is imported; create_vector_store and
# load_embedded_data use this object as the default for their `embeddings`
# parameter.
embeddings = load_embedding_model()
|
||||
|
||||
|
||||
# A function to create the vector store
|
||||
def create_vector_store(document, embeddings=embeddings):
    """Create a FAISS vector store from documents.

    Args:
        document: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding model used to embed them; defaults to the
            module-level ``embeddings`` object.

    Returns:
        The object returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(document, embeddings)
|
||||
|
||||
# A function to save the embedded data
|
||||
def save_embedded_data(docs, key="pdf"):
    """Persist a vector store under ``vec-db/index/faiss_index_<key>``.

    Args:
        docs: Object exposing ``save_local(path)`` (the vector store to save).
        key: Suffix that distinguishes the saved index; defaults to ``"pdf"``.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    docs.save_local(index_dir)
    print("Embeddings saved")
|
||||
|
||||
# A function to load the embedded data
|
||||
def load_embedded_data(embeddings=embeddings, key="pdf"):
    """Load a previously saved FAISS index from ``vec-db/index/faiss_index_<key>``.

    Args:
        embeddings: Embedding model passed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` object.
        key: Suffix identifying which saved index to load; defaults to ``"pdf"``.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    # SECURITY: allow_dangerous_deserialization=True opts in to unsafe
    # deserialization of the stored index. Only load index directories that
    # this project wrote itself, never files from an untrusted source.
    return FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
|
||||
|
||||
# A document search function
|
||||
def search(db, query, k=4):
    """Run a similarity search and collect the matching text and page numbers.

    Args:
        db: Vector store exposing ``similarity_search(query, k)``.
        query: Text to search for.
        k: Number of documents requested from the store.

    Returns:
        A 3-tuple ``(top_content, combined_content, pages)``:

        * ``top_content`` -- ``page_content`` of the first returned document.
        * ``combined_content`` -- every ``page_content``, each followed by a
          newline, concatenated in result order.
        * ``pages`` -- the ``"page"`` metadata value of each document, or
          ``None`` for a document without that key.

        When the store returns no documents the result is ``("", "", [])``.
    """
    docs = db.similarity_search(query, k)
    if not docs:
        # Nothing matched: return empty values instead of failing on docs[0].
        return "", "", []

    # join() avoids repeated string concatenation and keeps the builtin
    # all() from being shadowed by a local name.
    combined_content = "".join(f"{doc.page_content}\n" for doc in docs)
    # .get() tolerates documents that carry no page number in their metadata.
    pages = [doc.metadata.get("page") for doc in docs]
    return docs[0].page_content, combined_content, pages
|
||||
+146
-90
@@ -1,20 +1,35 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
|
||||
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
||||
"from langchain_community.vectorstores import FAISS"
|
||||
"## Libs import"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
|
||||
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
||||
"from langchain_community.vectorstores import FAISS\n",
|
||||
"from langchain_community.document_loaders import PyPDFLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the embeddings model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
@@ -32,27 +47,14 @@
|
||||
"encode_kwargs = {\"normalize_embeddings\": True}\n",
|
||||
"embeddings = HuggingFaceBgeEmbeddings(\n",
|
||||
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
|
||||
" )\n",
|
||||
"\n"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the document \n",
|
||||
"loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")"
|
||||
"## Experiment for pdf loading"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -61,85 +63,54 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pages = loader.load_and_split()"
|
||||
"# creating a function that checks the document type and loads the document\n",
|
||||
"def load_pdf_document(document_path):\n",
|
||||
" if document_path.endswith(\".pdf\"):\n",
|
||||
" pdf_doc = PyPDFLoader(document_path)\n",
|
||||
" pages = pdf_doc.load_and_split()\n",
|
||||
" return pages\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Unsupported document type for {document_path}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"588"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(pages)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and \\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pages[0]"
|
||||
"# Load the document \n",
|
||||
"document_path = \"data/corolla-2020-toyota-owners-manual.pdf\"\n",
|
||||
"pdf_pages = load_pdf_document(document_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
|
||||
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db = FAISS.from_documents(pages, embeddings)"
|
||||
"db = FAISS.from_documents(pdf_pages, embeddings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def save_embedded_data(embeddings, key=\"rand\"):\n",
|
||||
"def save_embedded_data(embeddings, key=\"pdf\"):\n",
|
||||
" embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n",
|
||||
" print(\"Embeddings saved\")\n",
|
||||
"\n",
|
||||
"def load_embedded_data(embeddings, key):\n",
|
||||
"def load_embedded_data(embeddings, key=\"pdf\"):\n",
|
||||
" embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n",
|
||||
" return embed_db"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -151,16 +122,23 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"save_embedded_data(db, key=\"rand\")"
|
||||
"save_embedded_data(db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_db = load_embedded_data(embeddings, key=\"rand\")"
|
||||
"load_db = load_embedded_data(embeddings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Search"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -270,44 +248,122 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def search(db, query, k=4):\n",
|
||||
" docs = db.similarity_search(query, k)\n",
|
||||
" all = \"\"\n",
|
||||
" pages = []\n",
|
||||
" for doc in docs:\n",
|
||||
" all += f\"{doc.page_content}\\n\"\n",
|
||||
" return docs[0].page_content, all"
|
||||
" pages.append(doc.metadata['page'])\n",
|
||||
" return docs[0].page_content, all, pages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "'FAISS' object is not callable",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n",
|
||||
"Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n",
|
||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n",
|
||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n",
|
||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"206 4-5. Using the driving support systems\n",
|
||||
"COROLLA_UWARNING\n",
|
||||
"■Before using LDA system\n",
|
||||
"●Do not rely solely upon the LDA \n",
|
||||
"system. The LDA system does \n",
|
||||
"not automatically drive the vehi-cle or reduce the amount of \n",
|
||||
"attention that must be paid to \n",
|
||||
"the area in front of the vehicle. The driver must always assume \n",
|
||||
"full responsibilit y for driving \n",
|
||||
"safely by paying careful atten-\n",
|
||||
"tion to the surrounding condi-tions and operating the steering \n",
|
||||
"wheel to correct the path of the \n",
|
||||
"vehicle. Also, the driver must take adequate breaks when \n",
|
||||
"fatigued, such as from driving \n",
|
||||
"for a long period of time.\n",
|
||||
"●Failure to perform appropriate \n",
|
||||
"driving operations and pay care-\n",
|
||||
"ful attention may lead to an \n",
|
||||
"accident, resulting in death or serious injury.\n",
|
||||
"●When not using the LDA sys-\n",
|
||||
"tem, use the LDA switch to turn \n",
|
||||
"the system off.\n",
|
||||
"■Situations unsuitable for LDA system\n",
|
||||
"In the following situations, use the LDA switch to turn the system off. \n",
|
||||
"Failure to do so may lead to an \n",
|
||||
"accident, resulting in death or serious injury.\n",
|
||||
"●Vehicle is driven on a road sur-\n",
|
||||
"face which is slippery due to \n",
|
||||
"rainy weather, fallen snow, freezing, etc.\n",
|
||||
"●Vehicle is driven on a snow-cov-\n",
|
||||
"ered road.\n",
|
||||
"●White (yellow) lin es are difficult \n",
|
||||
"to see due to rain, snow, fog, \n",
|
||||
"dust, etc.\n",
|
||||
"●A spare tire, tire chains, etc. are \n",
|
||||
"equipped.●When the tires have been excessively worn, or when the \n",
|
||||
"tire inflation p ressure is low.\n",
|
||||
"●When tires of a size other than specified are installed.\n",
|
||||
"●Vehicle is driven in traffic lanes \n",
|
||||
"other than that highways and \n",
|
||||
"freeways.\n",
|
||||
"●During emergency towing.\n",
|
||||
"■Preventing LDA system mal-functions and operations per-\n",
|
||||
"formed by mistake\n",
|
||||
"●Do not modify the headlights or place stickers, etc. on the sur-\n",
|
||||
"face of the lights.\n",
|
||||
"●Do not modify the suspension etc. If the suspension etc. needs \n",
|
||||
"to be replaced, contact your \n",
|
||||
"Toyota dealer.\n",
|
||||
"●Do not install or place anything on the hoo d or grille. Also, do \n",
|
||||
"not install a gr ille guard (bull \n",
|
||||
"bars, kangaroo bar, etc.).\n",
|
||||
"●If your windshield needs repairs, contact your Toyota \n",
|
||||
"dealer.\n",
|
||||
"■Conditions in which functions \n",
|
||||
"may not operate properly\n",
|
||||
"In the following situations, the \n",
|
||||
"functions may not operate prop-erly and the vehicle may depart \n",
|
||||
"from its lane. Drive safely by \n",
|
||||
"always paying careful attention to your surroundings and operate \n",
|
||||
"the steering whee l to correct the \n",
|
||||
"path of the vehicle without relying \n",
|
||||
"solely on the functions.\n",
|
||||
"●Vehicle is being driven around a sharp curve.\n",
|
||||
"https://www.MyCarManual.com\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"search_result, all = search(db, \"What is LDA\")\n",
|
||||
"search_result, all, pages = search(db, \"What is LDA\")\n",
|
||||
"print( search_result )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[205, 208, 204, 212]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,28 @@
|
||||
2024-08-05 22:09:11,365 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
|
||||
2024-08-05 22:09:11,365 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
|
||||
2024-08-05 22:09:11,365 - INFO - Checking if the document is a pdf
|
||||
2024-08-05 22:09:11,365 - INFO - Document is a pdf
|
||||
2024-08-05 22:09:11,365 - INFO - Loading and splitting the document
|
||||
2024-08-05 22:09:56,949 - INFO - Document loaded and split into 588 pages
|
||||
2024-08-05 22:09:56,949 - INFO - Creating vector store
|
||||
2024-08-05 22:10:06,736 - INFO - Loading faiss with AVX2 support.
|
||||
2024-08-05 22:10:06,774 - INFO - Successfully loaded faiss with AVX2 support.
|
||||
2024-08-05 22:10:06,800 - INFO - Vector store created
|
||||
2024-08-05 22:10:06,802 - INFO - Saving the vector store
|
||||
2024-08-05 22:11:24,966 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
|
||||
2024-08-05 22:11:24,966 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
|
||||
2024-08-05 22:11:24,966 - INFO - Checking if the document is a pdf
|
||||
2024-08-05 22:11:24,966 - INFO - Document is a pdf
|
||||
2024-08-05 22:11:24,966 - INFO - Loading and splitting the document
|
||||
2024-08-05 22:12:09,202 - INFO - Document loaded and split into 588 pages
|
||||
2024-08-05 22:12:09,202 - INFO - Creating vector store
|
||||
2024-08-05 22:12:19,066 - INFO - Loading faiss with AVX2 support.
|
||||
2024-08-05 22:12:19,089 - INFO - Successfully loaded faiss with AVX2 support.
|
||||
2024-08-05 22:12:19,123 - INFO - Vector store created
|
||||
2024-08-05 22:12:19,123 - INFO - Saving the vector store
|
||||
2024-08-05 22:12:19,131 - INFO - Vector store saved
|
||||
2024-08-05 22:12:55,111 - INFO - Loading faiss with AVX2 support.
|
||||
2024-08-05 22:12:55,144 - INFO - Successfully loaded faiss with AVX2 support.
|
||||
2024-08-05 22:12:55,205 - INFO - Receiving the search query
|
||||
2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
|
||||
2024-08-05 22:13:04,241 - INFO - Search completed
|
||||
@@ -0,0 +1,19 @@
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
|
||||
# Directory and file that receive the application log. Both paths are
# relative, so they resolve against the process's current working directory.
LOG_DIR = 'loggings'
LOG_FILE = 'loggings/app.log'

# exist_ok=True creates the directory when it is missing and avoids the race
# between a separate existence check and the creation call.
os.makedirs(LOG_DIR, exist_ok=True)

# Send INFO-and-above records to both the log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so messages containing non-ASCII text can be written
        # regardless of the platform's default file encoding.
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(),
    ],
)

# Logger imported by the other modules of the project.
logger = logging.getLogger(__name__)
|
||||
@@ -0,0 +1,25 @@
|
||||
from data_ingestion.utils import search, load_embedded_data
|
||||
import sys, os
|
||||
|
||||
# Add the root directory to sys.path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
|
||||
|
||||
|
||||
# loading the embedded data
|
||||
# Loaded when this module is imported, using the default embeddings and key
# of load_embedded_data.
embed_db = load_embedded_data()


if __name__ == "__main__":
    # Interactive entry point: read one query from stdin, search the loaded
    # vector store and print the results.
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Searching for {query}")
    # Named combined_content rather than `all` so the builtin all() is not
    # shadowed at module scope.
    page_content, combined_content, pages = search(embed_db, query)
    logger.info("Search completed")
    logger.info(f"Page content: {page_content}")
    print(f"Page content: {page_content}")
    print(f"Pages: {pages}")
    print(f"All: {combined_content}")
    print("Search completed")
|
||||
Binary file not shown.
Reference in New Issue
Block a user