diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..0fe238ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +Ai indexing +data +images \ No newline at end of file diff --git a/__pycache__/text_extractor.cpython-311.pyc b/__pycache__/text_extractor.cpython-311.pyc new file mode 100644 index 00000000..34fe646c Binary files /dev/null and b/__pycache__/text_extractor.cpython-311.pyc differ diff --git a/audio_experiment.ipynb b/audio_experiment.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/data/documents.json b/data/documents.json index c700f924..e8bf3afc 100644 --- a/data/documents.json +++ b/data/documents.json @@ -1 +1 @@ -{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]} \ No newline at end of file +{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to 
change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["5f26879376a44a77bbc2b966b9189ca4", "51b1c6cab5f1440e9fd948b6d858e812", "1d63ef4a149d4addb0803370885d70c1", "749ea365f2244eb6b23bb17e28d9cd2e", "e6d3736c0e8f424382c2ff5298814534", "91b116993e4b4865b3dc7bceca9749f0", "77f9558bd9894daeaf9aaea4013ed20e", "d974631f67d242739343b3c32e91355c", "a18ad23b3c7641b3a61e77e0e143a265", "0b710683db314b14ae6f0e0919a12068", "136c808efffa4f8798c55e7595c768a1", "236dc9603c9c4e83840721175d3dc861", "5aa9f750dbdd403c94abb53883c0fad2", "0382e54d68a84021803b07c7cf7c3ad9", "a772d008c9bf4ee6a2026f00998f3f2c", "66afb44563f6449ca705a39c9a72440d", "59ef1e9cc81b41d3a32d5dcc069a0ace", "9991145202384596bc3f5ff666d213bd", "d7f49b6629e84ec7bfd1a0048d2ade76", "689296161d6b46e8b9e792dbdc8a155d", "ba6be2ab8ae74042a9c9da51c46b8f90", "d62daf66b833419fae17333395cd7b04", "7b109e03c62343fd8f8e23dcf6bdfd3b", "8254386611fc4feb85744f69e5120e18", "022d6bae08274a618921c49590040a1f", "719ed0e4d9a94fe39799c227eaac1e05"], "num_pages": [1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} \ No newline at end of file diff --git a/data_ingestion/__pycache__/utils.cpython-311.pyc b/data_ingestion/__pycache__/utils.cpython-311.pyc index 403ae253..55171dc7 100644 Binary files a/data_ingestion/__pycache__/utils.cpython-311.pyc and b/data_ingestion/__pycache__/utils.cpython-311.pyc differ diff --git a/data_ingestion/utils.py b/data_ingestion/utils.py index 50ce4885..a4eaff52 100644 --- a/data_ingestion/utils.py +++ b/data_ingestion/utils.py @@ -8,6 +8,7 @@ from 
def create_image_document(image_path):
    """Wrap the text read from an image in a one-element document list.

    Args:
        image_path: Path to the image file. Forward and backward slashes are
            both accepted as directory separators.

    Returns:
        A list containing a single ``Document``. Its ``page_content`` is the
        text returned by ``TextExtractor.read_text_from_image`` with every
        character that is neither alphanumeric nor whitespace removed, and
        its metadata is ``{'filename': <name>}`` where ``<name>`` is the file
        name without its directory and without its final extension.
    """
    # Normalise Windows separators before taking the base name: splitting on
    # '/' alone left the directory in the name (e.g. 'data\\IMG_1436') when
    # the path was built with backslashes.
    normalised_path = image_path.replace('\\', '/')
    # splitext strips only the last extension, so 'front.view.jpeg' keeps
    # 'front.view' instead of being cut at the first dot.
    image_name = os.path.splitext(os.path.basename(normalised_path))[0]
    # setting image name as metadata
    metadata = {'filename': image_name}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # Drop punctuation and symbols. Letters, digits and all whitespace
    # (spaces, tabs, line breaks) are kept.
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
    doc = Document(page_content=text, metadata=metadata)
    # returning the document in a list
    return [doc]
# A document search function
def search(db, query, k=3):
    """Run a similarity search and summarise the hits.

    Args:
        db: Vector store exposing ``similarity_search(query, k)``.
        query: Text to search for.
        k: Maximum number of hits to request.

    Returns:
        A tuple ``(best, combined, locations)``:
        * ``best`` - ``page_content`` of the first hit, or ``""`` if there
          are no hits.
        * ``combined`` - the ``page_content`` of every hit, each followed by
          a newline, concatenated in result order.
        * ``locations`` - one entry per hit: the ``page`` metadata value when
          present, otherwise ``filename``, otherwise ``source``, otherwise
          ``None``.
    """
    docs = db.similarity_search(query, k)
    # An empty result set used to crash on docs[0]; report "nothing found".
    if not docs:
        return "", "", []

    combined = "".join(f"{doc.page_content}\n" for doc in docs)

    pages = []
    for doc in docs:
        metadata = doc.metadata
        # Explicit key checks instead of a bare ``except`` so unrelated errors
        # are not swallowed, and so a legitimate page number of 0 is kept.
        # NOTE(review): the 'source' fallback assumes text/docx loaders only
        # set that key - confirm against the loaders used in load_document.
        for key in ('page', 'filename', 'source'):
            if key in metadata:
                pages.append(metadata[key])
                break
        else:
            pages.append(None)

    return docs[0].page_content, combined, pages
"text/plain": [ + "{'filename': 'IMG_1438'}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text[0].metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smog_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/loggings/app.log b/loggings/app.log index 83cf54b1..35d6d571 100644 --- a/loggings/app.log +++ b/loggings/app.log @@ -50,3 +50,33 @@ 2024-08-07 18:46:38,939 - INFO - Loading the embeddings 2024-08-07 18:46:38,939 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en 2024-08-07 18:47:03,089 - INFO - Embeddings loaded +2024-08-08 14:03:36,111 - INFO - Loading the embeddings +2024-08-08 14:03:36,113 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en +2024-08-08 14:03:39,637 - INFO - 
Embeddings loaded +2024-08-08 14:03:39,637 - INFO - Loading data from ./data +2024-08-08 14:04:29,085 - INFO - Data loaded +2024-08-08 14:04:29,087 - INFO - Creating vector store +2024-08-08 14:06:40,106 - INFO - Loading the embeddings +2024-08-08 14:06:40,106 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en +2024-08-08 14:06:43,523 - INFO - Embeddings loaded +2024-08-08 14:06:43,523 - INFO - Loading data from ./data +2024-08-08 14:20:21,150 - INFO - Loading the embeddings +2024-08-08 14:20:21,150 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en +2024-08-08 14:20:25,150 - INFO - Embeddings loaded +2024-08-08 14:20:25,150 - INFO - Loading data from ./data +2024-08-08 14:21:13,769 - INFO - Data loaded +2024-08-08 14:21:13,769 - INFO - Creating vector store +2024-08-08 14:21:24,386 - INFO - Vector store created +2024-08-08 14:21:24,386 - INFO - Saving the vector store +2024-08-08 14:21:24,386 - INFO - Vector store saved +2024-08-08 14:22:17,106 - INFO - Receiving the search query +2024-08-08 14:22:23,740 - INFO - Searching for Accidented car before repair +2024-08-08 14:24:45,013 - INFO - Receiving the search query +2024-08-08 14:25:07,699 - INFO - Searching for Accidented car before repair +2024-08-08 14:28:43,776 - INFO - Receiving the search query +2024-08-08 14:28:46,944 - INFO - Searching for Accidented car before repair +2024-08-08 14:29:13,295 - INFO - Receiving the search query +2024-08-08 14:29:17,628 - INFO - Searching for Accidented car before repair +2024-08-08 14:29:17,820 - INFO - Search completed +2024-08-08 14:29:17,820 - INFO - Page content: Accidented car Before repair + diff --git a/requirements.txt b/requirements.txt index 5f3958d6..6db0dfa7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,6 @@ langchain-text-splitters unstructured[all-docs] docx2txt docx -"fastapi[standard]" \ No newline at end of file +fastapi[standard] +pdfplumber +pytesseract \ No newline at end of file diff --git 
a/text_extractor.py b/text_extractor.py index efd435e9..82d4572a 100644 --- a/text_extractor.py +++ b/text_extractor.py @@ -44,8 +44,8 @@ class TextExtractor: except Exception as e: print(f"Error reading text from image: {e}") return "" - finally: - os.remove(image_path) + # finally: + # os.remove(image_path) def read_text_from_pdf(self, pdf_path): """ diff --git a/vec-db/index/faiss_index_data/index.faiss b/vec-db/index/faiss_index_data/index.faiss index cdb06fcd..163e591d 100644 Binary files a/vec-db/index/faiss_index_data/index.faiss and b/vec-db/index/faiss_index_data/index.faiss differ diff --git a/vec-db/index/faiss_index_data/index.pkl b/vec-db/index/faiss_index_data/index.pkl index 4d6c0f41..cc9c1998 100644 Binary files a/vec-db/index/faiss_index_data/index.pkl and b/vec-db/index/faiss_index_data/index.pkl differ diff --git a/video_experiment.ipynb b/video_experiment.ipynb new file mode 100644 index 00000000..62f01dbc --- /dev/null +++ b/video_experiment.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}