Files
ds_fire_fighter/doc-experiment.ipynb
T

341 lines
17 KiB
Plaintext
Raw Normal View History

2024-08-05 21:08:29 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
"source": [
"# Initialize embedding\n",
"model_name = \"BAAI/bge-small-en\"\n",
"model_kwargs = {\"device\": \"cuda\"} #can also be cpu\n",
"encode_kwargs = {\"normalize_embeddings\": True}\n",
"embeddings = HuggingFaceBgeEmbeddings(\n",
" model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs\n",
" )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFLoader"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the document \n",
"loader = PyPDFLoader(\"data/corolla-2020-toyota-owners-manual.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"pages = loader.load_and_split()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"588"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(pages)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'data/corolla-2020-toyota-owners-manual.pdf', 'page': 0}, page_content='1\\n2\\n3\\n456789\\n9\\n10\\nCOROLLA_UPictorial index Search by illustration\\nFor safety \\nand securityMake sure to read through them\\n(Main topics: Child seat, theft deterrent system)\\nVehicle status information and \\nindicatorsReading driving-related information\\n(Main topics: Meters, multi-information display)\\nBefore drivingOpening and closing the doors and windows, \\nadjustment before driving\\n(Main topics: Keys, doors, seats)\\nDrivingOperations and advice which are necessary for driving\\n(Main topics: Starting engine, refueling)\\nEntune audioOperating the Entune Audio\\n(Main topics: Audio/visual, phone, Toyota Entune)\\nInterior featuresUsage of the interior features\\n(Main topics: Air conditioner, storage features)\\nMaintenance \\nand careCaring for your vehicle and maintenance \\nprocedures\\n(Main topics: Interior and exterior, light bulbs)\\nWhen trouble \\narisesWhat to do in case of malfunction and emergency\\n(Main topics: Battery discharge, flat tire)\\nVehicle specifications Vehicle specifications, customizable features\\n(Main topics: Fuel, oil, tire inflation pressure)\\nFor ownersReporting safety defects for U.S. owners, and seat \\nbelt and SRS airbag instructions for Canadian owners\\nIndexSearch by symptom\\nSearch alphabetically\\nhttps://www.MyCarManual.com')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
]
}
],
"source": [
"db = FAISS.from_documents(pages, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def save_embedded_data(embeddings, key=\"rand\"):\n",
" embeddings.save_local(f\"vec-db/index/faiss_index_{key}\")\n",
" print(\"Embeddings saved\")\n",
"\n",
"def load_embedded_data(embeddings, key):\n",
" embed_db = FAISS.load_local(f\"vec-db/index/faiss_index_{key}\", embeddings, allow_dangerous_deserialization=True)\n",
" return embed_db"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embeddings saved\n"
]
}
],
"source": [
"save_embedded_data(db, key=\"rand\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"load_db = load_embedded_data(embeddings, key=\"rand\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"query = \"Steering assist function/lane centering function\"\n",
"docs = load_db.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"202 4-5. Using the driving support systems\n",
"COROLLA_UInside of displayed lines is \n",
"black\n",
"Indicates that the system is not able to recognize white (yellow) \n",
"lines or a course\n",
"* or is temporar-\n",
"ily canceled.\n",
"*: Boundary between asphalt and \n",
"the side of the road, such as \n",
"grass, soil, or a curb\n",
"Follow-up cruising display\n",
"Displayed when the multi-informa-tion display is switched to the driv-ing support system information screen.\n",
"Indicates that steering assist of the \n",
"lane centering function is operating by monitoring the position of a pre-ceding vehicle.\n",
"When the follow-up cruising display \n",
"is displayed, if the preceding vehi-cle moves, your vehicle may move in the same way. A lways pay care-\n",
"ful attention to your surroundings and operate the steering wheel as necessary to correct the path of the vehicle and ensure safety.\n",
"■Operation conditions of each \n",
"function\n",
"●Lane departure alert function\n",
"This function oper ates when all of \n",
"the following cond itions are met.\n",
"• LTA is turned on.• Vehicle speed is approximately 32 \n",
"mph (50 km/h) or more.*1\n",
"• System recognizes white (yellow) \n",
"lane lines or a course*2. (When a \n",
"white [yellow] line or course*2 is \n",
"recognized on only one side, the system will operate only for the \n",
"recognized side.)\n",
"• Width of traffic lane is approxi-\n",
"mately 9.8 ft. (3 m) or more.\n",
"• Turn signal lever is not operated.\n",
"(Vehicles with a Blind Spot Moni-\n",
"tor: Except when another vehicle \n",
"is in the lane on the side where the turn signal was operated)\n",
"• Vehicle is not being driven around \n",
"a sharp curve.\n",
"• No system malfunctions are \n",
"detected. ( P.204)\n",
"*1:The function oper ates even if the \n",
"vehicle speed is less than \n",
"approximately 32 mph (50 km/h) when the lane centering function is operating.\n",
"*2:Boundary between asphalt and \n",
"the side of the road, such as grass, soil, or a curb\n",
"●Steering assist function\n",
"This function operates when all of the following conditions are met in addition to the operation conditions for the lane departure alert function.\n",
"• Setting for “Steering Assist” in \n",
"of the multi-information display is \n",
"set to “ON”. ( P.548)\n",
"• Vehicle is not accelerated or \n",
"decelerated by a fixed amount or more.\n",
"• Steering wheel is not operated \n",
"with a steering force level suitable \n",
"for changing lanes.\n",
"• ABS, VSC, TRAC and PCS are \n",
"not operating.\n",
"• TRAC or VSC is not turned off.\n",
"• Hands off steering wheel warning \n",
"is not displayed. ( P.204)\n",
"●Vehicle sway warning function\n",
"This function operates when all of \n",
"https://www.MyCarManual.com\n"
]
}
],
"source": [
"print(docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"201\n"
]
}
],
"source": [
"print(docs[0].metadata['page'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def search(db, query, k=4):\n",
" docs = db.similarity_search(query, k)\n",
" all = \"\"\n",
" for doc in docs:\n",
" all += f\"{doc.page_content}\\n\"\n",
" return docs[0].page_content, all"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'FAISS' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m search_result, \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is LDA\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m( search_result )\n",
"Cell \u001b[1;32mIn[15], line 2\u001b[0m, in \u001b[0;36msearch\u001b[1;34m(db, query, k)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(db, query, k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m----> 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mall\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:530\u001b[0m, in \u001b[0;36mFAISS.similarity_search\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search\u001b[39m(\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 512\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 516\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Document]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 519\u001b[0m \n\u001b[0;32m 520\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[38;5;124;03m List of Documents most similar to the query.\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 530\u001b[0m docs_and_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimilarity_search_with_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mfilter\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfetch_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfetch_k\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [doc \u001b[38;5;28;01mfor\u001b[39;00m doc, _ \u001b[38;5;129;01min\u001b[39;00m docs_and_scores]\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:402\u001b[0m, in \u001b[0;36mFAISS.similarity_search_with_score\u001b[1;34m(self, query, k, filter, fetch_k, **kwargs)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msimilarity_search_with_score\u001b[39m(\n\u001b[0;32m 379\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 380\u001b[0m query: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 385\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Tuple[Document, \u001b[38;5;28mfloat\u001b[39m]]:\n\u001b[0;32m 386\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return docs most similar to query.\u001b[39;00m\n\u001b[0;32m 387\u001b[0m \n\u001b[0;32m 388\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[38;5;124;03m L2 distance in float. Lower score represents more similarity.\u001b[39;00m\n\u001b[0;32m 401\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 402\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 403\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimilarity_search_with_score_by_vector(\n\u001b[0;32m 404\u001b[0m embedding,\n\u001b[0;32m 405\u001b[0m k,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 409\u001b[0m )\n\u001b[0;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m docs\n",
"File \u001b[1;32mc:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\langchain_community\\vectorstores\\faiss.py:156\u001b[0m, in \u001b[0;36mFAISS._embed_query\u001b[1;34m(self, text)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_function\u001b[38;5;241m.\u001b[39membed_query(text)\n\u001b[0;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mTypeError\u001b[0m: 'FAISS' object is not callable"
]
}
],
"source": [
"search_result, all = search(db, \"What is LDA\")\n",
"print( search_result )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ai_index",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}