Pdf Ingestion pipeline completed

This commit is contained in:
timothyafolami
2024-08-05 22:14:19 +01:00
parent b0c3eb8032
commit c34de21971
15 changed files with 318 additions and 90 deletions
+146 -90
View File
@@ -1,20 +1,35 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS"
"## Library imports"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS\n",
"from langchain_community.document_loaders import PyPDFLoader"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading the embeddings model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
@@ -32,27 +47,14 @@
"encode_kwargs = {\"normalize_embeddings\": True}\n",
"embeddings = HuggingFaceBgeEmbeddings(\n",
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
" )\n",
"\n"
" )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFLoader"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the document \n",
"loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")"
"## Experiment: PDF loading"
]
},
{
@@ -61,85 +63,54 @@
"metadata": {},
"outputs": [],
"source": [
"pages = loader.load_and_split()"
"def load_pdf_document(document_path):\n",
"    \"\"\"Load a PDF with PyPDFLoader and return its pages as split documents.\n",
"\n",
"    Args:\n",
"        document_path: Path to the file, as a string or path-like object.\n",
"            The `.pdf` extension is matched case-insensitively.\n",
"\n",
"    Returns:\n",
"        The result of `PyPDFLoader(...).load_and_split()`.\n",
"\n",
"    Raises:\n",
"        ValueError: If the path does not end in `.pdf`.\n",
"    \"\"\"\n",
"    # Normalise to str so pathlib.Path inputs work and `.PDF` is accepted.\n",
"    path_text = str(document_path)\n",
"    if not path_text.lower().endswith(\".pdf\"):\n",
"        raise ValueError(f\"Unsupported document type for {document_path}\")\n",
"\n",
"    return PyPDFLoader(path_text).load_and_split()\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"588"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"len(pages)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and \\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages[0]"
"# Load the document \n",
"document_path = \"data/corolla-2020-toyota-owners-manual.pdf\"\n",
"pdf_pages = load_pdf_document(document_path)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
]
}
],
"outputs": [],
"source": [
"db = FAISS.from_documents(pages, embeddings)"
"db = FAISS.from_documents(pdf_pages, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def save_embedded_data(embeddings, key=\"rand\"):\n",
"def save_embedded_data(embeddings, key=\"pdf\"):\n",
"    \"\"\"Persist a vector store under `vec-db/index/faiss_index_<key>`.\n",
"\n",
"    `embeddings` must expose `save_local(path)`; this notebook passes the\n",
"    FAISS store built by `FAISS.from_documents`, not the embedding model.\n",
"    \"\"\"\n",
"    index_dir = f\"vec-db/index/faiss_index_{key}\"\n",
"    embeddings.save_local(index_dir)\n",
"    print(\"Embeddings saved\")\n",
"\n",
"def load_embedded_data(embeddings, key):\n",
"def load_embedded_data(embeddings, key=\"pdf\"):\n",
"    \"\"\"Load the FAISS index stored at `vec-db/index/faiss_index_<key>`.\n",
"\n",
"    `embeddings` is forwarded to `FAISS.load_local` as its second argument.\n",
"    \"\"\"\n",
"    # NOTE(review): allow_dangerous_deserialization=True presumably lets\n",
"    # load_local unpickle the stored index - confirm against the LangChain\n",
"    # docs, and only load index directories this project wrote itself.\n",
"    return FAISS.load_local(\n",
"        f\"vec-db/index/faiss_index_{key}\",\n",
"        embeddings,\n",
"        allow_dangerous_deserialization=True,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -151,16 +122,23 @@
}
],
"source": [
"save_embedded_data(db, key=\"rand\")"
"save_embedded_data(db)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"load_db = load_embedded_data(embeddings, key=\"rand\")"
"load_db = load_embedded_data(embeddings)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Search"
]
},
{
@@ -270,44 +248,122 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def search(db, query, k=4):\n",
"    \"\"\"Run a similarity search and summarise the hits.\n",
"\n",
"    Args:\n",
"        db: Object exposing `similarity_search(query, k)` that returns documents\n",
"            with `page_content` and `metadata` attributes.\n",
"        query: Text to search for.\n",
"        k: Number of documents requested from the store.\n",
"\n",
"    Returns:\n",
"        A 3-tuple `(top_text, all_text, pages)`: the first hit's text, every\n",
"        hit's text joined with a trailing newline each, and each hit's\n",
"        `metadata['page']` (None when a hit has no page entry). Returns\n",
"        `(\"\", \"\", [])` when the search yields no documents.\n",
"    \"\"\"\n",
"    docs = db.similarity_search(query, k)\n",
"    if not docs:\n",
"        # Nothing matched: avoid IndexError on docs[0].\n",
"        return \"\", \"\", []\n",
"\n",
"    combined_text = \"\".join(f\"{doc.page_content}\\n\" for doc in docs)\n",
"    pages = [doc.metadata.get(\"page\") for doc in docs]\n",
"    return docs[0].page_content, combined_text, pages"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'FAISS' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n",
"Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable"
"name": "stdout",
"output_type": "stream",
"text": [
"206 4-5. Using the driving support systems\n",
"COROLLA_UWARNING\n",
"■Before using LDA system\n",
"●Do not rely solely upon the LDA \n",
"system. The LDA system does \n",
"not automatically drive the vehi-cle or reduce the amount of \n",
"attention that must be paid to \n",
"the area in front of the vehicle. The driver must always assume \n",
"full responsibilit y for driving \n",
"safely by paying careful atten-\n",
"tion to the surrounding condi-tions and operating the steering \n",
"wheel to correct the path of the \n",
"vehicle. Also, the driver must take adequate breaks when \n",
"fatigued, such as from driving \n",
"for a long period of time.\n",
"●Failure to perform appropriate \n",
"driving operations and pay care-\n",
"ful attention may lead to an \n",
"accident, resulting in death or serious injury.\n",
"●When not using the LDA sys-\n",
"tem, use the LDA switch to turn \n",
"the system off.\n",
"■Situations unsuitable for LDA system\n",
"In the following situations, use the LDA switch to turn the system off. \n",
"Failure to do so may lead to an \n",
"accident, resulting in death or serious injury.\n",
"●Vehicle is driven on a road sur-\n",
"face which is slippery due to \n",
"rainy weather, fallen snow, freezing, etc.\n",
"●Vehicle is driven on a snow-cov-\n",
"ered road.\n",
"●White (yellow) lin es are difficult \n",
"to see due to rain, snow, fog, \n",
"dust, etc.\n",
"●A spare tire, tire chains, etc. are \n",
"equipped.●When the tires have been excessively worn, or when the \n",
"tire inflation p ressure is low.\n",
"●When tires of a size other than specified are installed.\n",
"●Vehicle is driven in traffic lanes \n",
"other than that highways and \n",
"freeways.\n",
"●During emergency towing.\n",
"■Preventing LDA system mal-functions and operations per-\n",
"formed by mistake\n",
"●Do not modify the headlights or place stickers, etc. on the sur-\n",
"face of the lights.\n",
"●Do not modify the suspension etc. If the suspension etc. needs \n",
"to be replaced, contact your \n",
"Toyota dealer.\n",
"●Do not install or place anything on the hoo d or grille. Also, do \n",
"not install a gr ille guard (bull \n",
"bars, kangaroo bar, etc.).\n",
"●If your windshield needs repairs, contact your Toyota \n",
"dealer.\n",
"■Conditions in which functions \n",
"may not operate properly\n",
"In the following situations, the \n",
"functions may not operate prop-erly and the vehicle may depart \n",
"from its lane. Drive safely by \n",
"always paying careful attention to your surroundings and operate \n",
"the steering whee l to correct the \n",
"path of the vehicle without relying \n",
"solely on the functions.\n",
"●Vehicle is being driven around a sharp curve.\n",
"https://www.MyCarManual.com\n"
]
}
],
"source": [
"search_result, all = search(db, \"What is LDA\")\n",
"search_result, all, pages = search(db, \"What is LDA\")\n",
"print( search_result )"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[205, 208, 204, 212]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages"
]
},
{
"cell_type": "code",
"execution_count": null,