Pdf Ingestion pipeline completed
This commit is contained in:
@@ -0,0 +1,16 @@
|
|||||||
|
---- 1. Load User Document
|
||||||
|
----> Starting with text documents, such as PDF, TXT and DOCX files.
|
||||||
|
----> Data ingestion takes in the user data, loads the embedding model, and then creates a vector database from that data.
|
||||||
|
----> Considerations:
|
||||||
|
1. PDFs already have pages, hence a text splitter won't be used. We want to be able to reference the pages on which the searched content can be found.
|
||||||
|
2. The approach for other data types can be different. We can use a text splitter for txt files and, if possible, add page numbers to the resulting chunks for easy reference.
|
||||||
|
3.
|
||||||
|
|
||||||
|
Data Ingestion Module:
|
||||||
|
This module will handle the data ingestion process.
|
||||||
|
utils.py --> keeps the reusable functions
|
||||||
|
pdf_ingest.py --> This module will handle pdfs
|
||||||
|
|
||||||
|
|
||||||
|
Loggings Module:
|
||||||
|
This module will keep logs of what's going on here.
|
||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,42 @@
|
|||||||
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
from utils import create_vector_store, save_embedded_data
|
||||||
|
import sys, os
|
||||||
|
|
||||||
|
# Add the root directory to sys.path
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
|
from loggings.logging_config import logger
|
||||||
|
|
||||||
|
# A function to load the pdf document
|
||||||
|
def load_pdf_document(document_path: str):
    """Load a PDF file and return its content as a list of documents.

    The extension check is case-insensitive, so ``manual.PDF`` is accepted
    in the same way as ``manual.pdf``.

    Args:
        document_path: Path to the file to load.

    Returns:
        The list produced by ``PyPDFLoader(document_path).load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end with ``.pdf``
            (compared case-insensitively).
    """
    logger.info(f"Loading document from {document_path}")
    logger.info("Checking if the document is a pdf")

    # Guard clause: reject anything that is not a PDF before touching the loader.
    if not document_path.lower().endswith(".pdf"):
        logger.error(f"Unsupported document type for {document_path}")
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    # NOTE(review): load_and_split() may re-chunk pages with LangChain's default
    # text splitter; if strict one-document-per-page output is required,
    # confirm whether load() is the better call here.
    pages = PyPDFLoader(document_path).load_and_split()
    logger.info(f"Document loaded and split into {len(pages)} pages")
    return pages
|
||||||
|
|
||||||
|
# creating a function that loads the pdf document and creates the vector store
|
||||||
|
def load_and_create_vector_store(document_path: str):
    """Run the PDF ingestion pipeline for a single document.

    The document is loaded with ``load_pdf_document``, turned into a vector
    store with ``create_vector_store`` and handed to ``save_embedded_data``.
    Each step is logged.

    Args:
        document_path: Path of the PDF to ingest.

    Returns:
        The status string ``"Vector store created and saved"``.
    """
    logger.info(f"Loading and creating vector store for {document_path}")
    documents = load_pdf_document(document_path)

    logger.info("Creating vector store")
    vector_store = create_vector_store(documents)
    logger.info("Vector store created")

    # Persist the embedded data.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    return "Vector store created and saved"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: ingest the sample owner's manual shipped in ./data.
    load_and_create_vector_store(
        document_path="./data/corolla-2020-toyota-owners-manual.pdf"
    )
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
||||||
|
from langchain_community.vectorstores import FAISS
|
||||||
|
|
||||||
|
|
||||||
|
# loading the embedding model
|
||||||
|
def load_embedding_model(model_name="BAAI/bge-small-en", device=None):
    """Build the HuggingFace BGE embedding model.

    Args:
        model_name: Hugging Face model identifier. Defaults to
            ``"BAAI/bge-small-en"``, the value that used to be hard-coded.
        device: Device string forwarded to the model (e.g. ``"cuda"`` or
            ``"cpu"``). When ``None`` (the default), ``"cuda"`` is used if
            PyTorch reports an available GPU and ``"cpu"`` otherwise, so the
            function no longer fails on machines without CUDA.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance configured with
        ``normalize_embeddings=True``.
    """
    if device is None:
        # Imported locally so the module's import block stays unchanged.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"

    return HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )
|
||||||
|
|
||||||
|
# loading the embedding model
|
||||||
|
# Built once at import time; create_vector_store and load_embedded_data bind
# this object as the default value of their `embeddings` parameter.
embeddings = load_embedding_model()
|
||||||
|
|
||||||
|
|
||||||
|
# A function to create the vector store
|
||||||
|
def create_vector_store(document, embeddings=embeddings):
    """Create a FAISS vector store from the given documents.

    Args:
        document: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding model to use; defaults to the module-level
            ``embeddings`` object.

    Returns:
        The object returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(document, embeddings)
|
||||||
|
|
||||||
|
# A function to save the embedded data
|
||||||
|
def save_embedded_data(docs, key="pdf"):
    """Save a vector store under ``vec-db/index/faiss_index_<key>``.

    Args:
        docs: Object exposing ``save_local(path)`` (the vector store to save).
        key: Suffix that distinguishes the saved index; defaults to ``"pdf"``.

    Returns:
        None. Prints ``"Embeddings saved"`` once ``save_local`` has returned.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    docs.save_local(index_dir)
    print("Embeddings saved")
|
||||||
|
|
||||||
|
# A function to load the embedded data
|
||||||
|
def load_embedded_data(embeddings=embeddings, key="pdf"):
    """Load the vector store saved under ``vec-db/index/faiss_index_<key>``.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` object.
        key: Suffix of the saved index to load; defaults to ``"pdf"``.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; only point this at index directories that
    # this project wrote itself, never at untrusted files.
    vector_store = FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return vector_store
|
||||||
|
|
||||||
|
# A document search function
|
||||||
|
def search(db, query, k=4):
    """Run a similarity search and collect the text and pages of the hits.

    Args:
        db: Vector store exposing ``similarity_search(query, k)``.
        query: Text to search for.
        k: Number of documents requested from the store; defaults to 4.

    Returns:
        A 3-tuple ``(top_content, combined_content, pages)``:

        * ``top_content`` - ``page_content`` of the first returned document.
        * ``combined_content`` - ``page_content`` of every returned document,
          each followed by a newline.
        * ``pages`` - the ``metadata['page']`` value of every returned
          document, in result order.

        When the store returns no documents, ``("", "", [])`` is returned
        instead of raising ``IndexError``.

    Raises:
        KeyError: If a returned document has no ``'page'`` metadata entry.
    """
    docs = db.similarity_search(query, k)
    if not docs:
        # Nothing matched: return empty values rather than failing on docs[0].
        return "", "", []

    # join() avoids repeated string concatenation and keeps the builtin
    # `all` free from shadowing.
    combined = "".join(f"{doc.page_content}\n" for doc in docs)
    pages = [doc.metadata["page"] for doc in docs]
    return docs[0].page_content, combined, pages
|
||||||
+146
-90
@@ -1,20 +1,35 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
|
"## Libs import"
|
||||||
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
|
||||||
"from langchain_community.vectorstores import FAISS"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
|
||||||
|
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
||||||
|
"from langchain_community.vectorstores import FAISS\n",
|
||||||
|
"from langchain_community.document_loaders import PyPDFLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading the embeddings model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
@@ -32,27 +47,14 @@
|
|||||||
"encode_kwargs = {\"normalize_embeddings\": True}\n",
|
"encode_kwargs = {\"normalize_embeddings\": True}\n",
|
||||||
"embeddings = HuggingFaceBgeEmbeddings(\n",
|
"embeddings = HuggingFaceBgeEmbeddings(\n",
|
||||||
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
|
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
|
||||||
" )\n",
|
" )"
|
||||||
"\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain_community.document_loaders import PyPDFLoader"
|
"## Experiment for pdf loading"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Load the document \n",
|
|
||||||
"loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -61,85 +63,54 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"pages = loader.load_and_split()"
|
"# creating a function that checks the document type and loads the document\n",
|
||||||
|
"def load_pdf_document(document_path):\n",
|
||||||
|
" if document_path.endswith(\".pdf\"):\n",
|
||||||
|
" pdf_doc = PyPDFLoader(document_path)\n",
|
||||||
|
" pages = pdf_doc.load_and_split()\n",
|
||||||
|
" return pages\n",
|
||||||
|
" else:\n",
|
||||||
|
" raise ValueError(f\"Unsupported document type for {document_path}\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"588"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"len(pages)"
|
"# Load the document \n",
|
||||||
]
|
"document_path = \"data/corolla-2020-toyota-owners-manual.pdf\"\n",
|
||||||
},
|
"pdf_pages = load_pdf_document(document_path)"
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and \\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"pages[0]"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
|
|
||||||
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"db = FAISS.from_documents(pages, embeddings)"
|
"db = FAISS.from_documents(pdf_pages, embeddings)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def save_embedded_data(embeddings, key=\"rand\"):\n",
|
"def save_embedded_data(embeddings, key=\"pdf\"):\n",
|
||||||
" embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n",
|
" embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n",
|
||||||
" print(\"Embeddings saved\")\n",
|
" print(\"Embeddings saved\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def load_embedded_data(embeddings, key):\n",
|
"def load_embedded_data(embeddings, key=\"pdf\"):\n",
|
||||||
" embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n",
|
" embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n",
|
||||||
" return embed_db"
|
" return embed_db"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -151,16 +122,23 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"save_embedded_data(db, key=\"rand\")"
|
"save_embedded_data(db)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"load_db = load_embedded_data(embeddings, key=\"rand\")"
|
"load_db = load_embedded_data(embeddings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Data Search"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -270,44 +248,122 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 15,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def search(db, query, k=4):\n",
|
"def search(db, query, k=4):\n",
|
||||||
" docs = db.similarity_search(query, k)\n",
|
" docs = db.similarity_search(query, k)\n",
|
||||||
" all = \"\"\n",
|
" all = \"\"\n",
|
||||||
|
" pages = []\n",
|
||||||
" for doc in docs:\n",
|
" for doc in docs:\n",
|
||||||
" all += f\"{doc.page_content}\\n\"\n",
|
" all += f\"{doc.page_content}\\n\"\n",
|
||||||
" return docs[0].page_content, all"
|
" pages.append(doc.metadata['page'])\n",
|
||||||
|
" return docs[0].page_content, all, pages"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"ename": "TypeError",
|
"name": "stdout",
|
||||||
"evalue": "'FAISS' object is not callable",
|
"output_type": "stream",
|
||||||
"output_type": "error",
|
"text": [
|
||||||
"traceback": [
|
"206 4-5. Using the driving support systems\n",
|
||||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
"COROLLA_UWARNING\n",
|
||||||
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
"■Before using LDA system\n",
|
||||||
"Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n",
|
"●Do not rely solely upon the LDA \n",
|
||||||
"Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n",
|
"system. The LDA system does \n",
|
||||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n",
|
"not automatically drive the vehi-cle or reduce the amount of \n",
|
||||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n",
|
"attention that must be paid to \n",
|
||||||
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
|
"the area in front of the vehicle. The driver must always assume \n",
|
||||||
"\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable"
|
"full responsibilit y for driving \n",
|
||||||
|
"safely by paying careful atten-\n",
|
||||||
|
"tion to the surrounding condi-tions and operating the steering \n",
|
||||||
|
"wheel to correct the path of the \n",
|
||||||
|
"vehicle. Also, the driver must take adequate breaks when \n",
|
||||||
|
"fatigued, such as from driving \n",
|
||||||
|
"for a long period of time.\n",
|
||||||
|
"●Failure to perform appropriate \n",
|
||||||
|
"driving operations and pay care-\n",
|
||||||
|
"ful attention may lead to an \n",
|
||||||
|
"accident, resulting in death or serious injury.\n",
|
||||||
|
"●When not using the LDA sys-\n",
|
||||||
|
"tem, use the LDA switch to turn \n",
|
||||||
|
"the system off.\n",
|
||||||
|
"■Situations unsuitable for LDA system\n",
|
||||||
|
"In the following situations, use the LDA switch to turn the system off. \n",
|
||||||
|
"Failure to do so may lead to an \n",
|
||||||
|
"accident, resulting in death or serious injury.\n",
|
||||||
|
"●Vehicle is driven on a road sur-\n",
|
||||||
|
"face which is slippery due to \n",
|
||||||
|
"rainy weather, fallen snow, freezing, etc.\n",
|
||||||
|
"●Vehicle is driven on a snow-cov-\n",
|
||||||
|
"ered road.\n",
|
||||||
|
"●White (yellow) lin es are difficult \n",
|
||||||
|
"to see due to rain, snow, fog, \n",
|
||||||
|
"dust, etc.\n",
|
||||||
|
"●A spare tire, tire chains, etc. are \n",
|
||||||
|
"equipped.●When the tires have been excessively worn, or when the \n",
|
||||||
|
"tire inflation p ressure is low.\n",
|
||||||
|
"●When tires of a size other than specified are installed.\n",
|
||||||
|
"●Vehicle is driven in traffic lanes \n",
|
||||||
|
"other than that highways and \n",
|
||||||
|
"freeways.\n",
|
||||||
|
"●During emergency towing.\n",
|
||||||
|
"■Preventing LDA system mal-functions and operations per-\n",
|
||||||
|
"formed by mistake\n",
|
||||||
|
"●Do not modify the headlights or place stickers, etc. on the sur-\n",
|
||||||
|
"face of the lights.\n",
|
||||||
|
"●Do not modify the suspension etc. If the suspension etc. needs \n",
|
||||||
|
"to be replaced, contact your \n",
|
||||||
|
"Toyota dealer.\n",
|
||||||
|
"●Do not install or place anything on the hoo d or grille. Also, do \n",
|
||||||
|
"not install a gr ille guard (bull \n",
|
||||||
|
"bars, kangaroo bar, etc.).\n",
|
||||||
|
"●If your windshield needs repairs, contact your Toyota \n",
|
||||||
|
"dealer.\n",
|
||||||
|
"■Conditions in which functions \n",
|
||||||
|
"may not operate properly\n",
|
||||||
|
"In the following situations, the \n",
|
||||||
|
"functions may not operate prop-erly and the vehicle may depart \n",
|
||||||
|
"from its lane. Drive safely by \n",
|
||||||
|
"always paying careful attention to your surroundings and operate \n",
|
||||||
|
"the steering whee l to correct the \n",
|
||||||
|
"path of the vehicle without relying \n",
|
||||||
|
"solely on the functions.\n",
|
||||||
|
"●Vehicle is being driven around a sharp curve.\n",
|
||||||
|
"https://www.MyCarManual.com\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"search_result, all = search(db, \"What is LDA\")\n",
|
"search_result, all, pages = search(db, \"What is LDA\")\n",
|
||||||
"print( search_result )"
|
"print( search_result )"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[205, 208, 204, 212]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pages"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,28 @@
|
|||||||
|
2024-08-05 22:09:11,365 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
|
||||||
|
2024-08-05 22:09:11,365 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
|
||||||
|
2024-08-05 22:09:11,365 - INFO - Checking if the document is a pdf
|
||||||
|
2024-08-05 22:09:11,365 - INFO - Document is a pdf
|
||||||
|
2024-08-05 22:09:11,365 - INFO - Loading and splitting the document
|
||||||
|
2024-08-05 22:09:56,949 - INFO - Document loaded and split into 588 pages
|
||||||
|
2024-08-05 22:09:56,949 - INFO - Creating vector store
|
||||||
|
2024-08-05 22:10:06,736 - INFO - Loading faiss with AVX2 support.
|
||||||
|
2024-08-05 22:10:06,774 - INFO - Successfully loaded faiss with AVX2 support.
|
||||||
|
2024-08-05 22:10:06,800 - INFO - Vector store created
|
||||||
|
2024-08-05 22:10:06,802 - INFO - Saving the vector store
|
||||||
|
2024-08-05 22:11:24,966 - INFO - Loading and creating vector store for ./data/corolla-2020-toyota-owners-manual.pdf
|
||||||
|
2024-08-05 22:11:24,966 - INFO - Loading document from ./data/corolla-2020-toyota-owners-manual.pdf
|
||||||
|
2024-08-05 22:11:24,966 - INFO - Checking if the document is a pdf
|
||||||
|
2024-08-05 22:11:24,966 - INFO - Document is a pdf
|
||||||
|
2024-08-05 22:11:24,966 - INFO - Loading and splitting the document
|
||||||
|
2024-08-05 22:12:09,202 - INFO - Document loaded and split into 588 pages
|
||||||
|
2024-08-05 22:12:09,202 - INFO - Creating vector store
|
||||||
|
2024-08-05 22:12:19,066 - INFO - Loading faiss with AVX2 support.
|
||||||
|
2024-08-05 22:12:19,089 - INFO - Successfully loaded faiss with AVX2 support.
|
||||||
|
2024-08-05 22:12:19,123 - INFO - Vector store created
|
||||||
|
2024-08-05 22:12:19,123 - INFO - Saving the vector store
|
||||||
|
2024-08-05 22:12:19,131 - INFO - Vector store saved
|
||||||
|
2024-08-05 22:12:55,111 - INFO - Loading faiss with AVX2 support.
|
||||||
|
2024-08-05 22:12:55,144 - INFO - Successfully loaded faiss with AVX2 support.
|
||||||
|
2024-08-05 22:12:55,205 - INFO - Receiving the search query
|
||||||
|
2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
|
||||||
|
2024-08-05 22:13:04,241 - INFO - Search completed
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
import logging
|
||||||
|
import logging.handlers
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Create loggings directory if it doesn't exist
|
||||||
|
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('loggings', exist_ok=True)

# Define the logging configuration
LOG_FILE = 'loggings/app.log'

# Every record at INFO level or above goes both to the log file and to the
# console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)

# Logger object imported by the other modules of the project.
logger = logging.getLogger(__name__)
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
from data_ingestion.utils import search, load_embedded_data
|
||||||
|
import sys, os
|
||||||
|
|
||||||
|
# Add the root directory to sys.path
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
|
from loggings.logging_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Load the saved vector store once, at import time, so it is ready for queries.
embed_db = load_embedded_data()


if __name__ == "__main__":
    # Interactive entry point: read one query, search, log and print the result.
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Searching for {query}")

    # Named `all_content` rather than `all` so the builtin all() is not shadowed.
    page_content, all_content, pages = search(embed_db, query)
    logger.info("Search completed")
    logger.info(f"Page content: {page_content}")

    print(f"Page content: {page_content}")
    print(f"Pages: {pages}")
    print(f"All: {all_content}")
    print("Search completed")
|
||||||
Binary file not shown.
Reference in New Issue
Block a user