image text extraction completed

2024-08-08 14:58:44 +01:00
parent 9a2a4c5fdd
commit c54dc17989
13 changed files with 331 additions and 7 deletions
@@ -0,0 +1,3 @@
+Ai indexing
+data
+images
@@ -1 +1 @@
-{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
+{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["5f26879376a44a77bbc2b966b9189ca4", "51b1c6cab5f1440e9fd948b6d858e812", "1d63ef4a149d4addb0803370885d70c1", "749ea365f2244eb6b23bb17e28d9cd2e", "e6d3736c0e8f424382c2ff5298814534", "91b116993e4b4865b3dc7bceca9749f0", "77f9558bd9894daeaf9aaea4013ed20e", "d974631f67d242739343b3c32e91355c", "a18ad23b3c7641b3a61e77e0e143a265", "0b710683db314b14ae6f0e0919a12068", "136c808efffa4f8798c55e7595c768a1", "236dc9603c9c4e83840721175d3dc861", "5aa9f750dbdd403c94abb53883c0fad2", "0382e54d68a84021803b07c7cf7c3ad9", "a772d008c9bf4ee6a2026f00998f3f2c", "66afb44563f6449ca705a39c9a72440d", "59ef1e9cc81b41d3a32d5dcc069a0ace", "9991145202384596bc3f5ff666d213bd", "d7f49b6629e84ec7bfd1a0048d2ade76", "689296161d6b46e8b9e792dbdc8a155d", "ba6be2ab8ae74042a9c9da51c46b8f90", "d62daf66b833419fae17333395cd7b04", "7b109e03c62343fd8f8e23dcf6bdfd3b", "8254386611fc4feb85744f69e5120e18", "022d6bae08274a618921c49590040a1f", "719ed0e4d9a94fe39799c227eaac1e05"], "num_pages": [1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
 from langchain_community.document_loaders import Docx2txtLoader
 from uuid import uuid4
 from langchain_core.documents  import Document
+from text_extractor import TextExtractor
 import os
 import json

@@ -90,6 +91,18 @@ def load_document(document_path):
    else:
        raise ValueError(f"Unsupported document type for {document_path}")

+def create_image_document(image_path):
+    """Build a one-element document list from the text found in an image.
+
+    The image is handed to ``TextExtractor.read_text_from_image``; the text
+    it returns becomes the page content, and the file name (without its
+    directory and extension) is stored under the ``'filename'`` metadata key.
+
+    Args:
+        image_path: Path of the image file to read.
+
+    Returns:
+        A list holding exactly one ``Document``; the directory loader reads
+        it back with ``doc[0]``.
+    """
+    # os.path understands the platform's separators, so a Windows-style path
+    # such as data\IMG_1438.jpeg no longer leaks "data\" into the name, and
+    # splitext only drops the final extension instead of cutting at the
+    # first dot
+    image_name = os.path.splitext(os.path.basename(image_path))[0]
+    # setting image name as metadata
+    metadata = {'filename': image_name}
+    text = TextExtractor().read_text_from_image(image_path)
+    # keep only letters, digits and whitespace (line breaks are whitespace,
+    # so they are preserved)
+    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
+    # returning the document in a list
+    return [Document(page_content=text, metadata=metadata)]


 def save_embedded_data(embeddings, key="data"):
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
            # adding the document name to the doc_names list
            doc_names.append(doc_name)
            print(f"Document {doc_name} loaded")
+        elif extension in image_doc:
+            # creating an image document
+            doc = create_image_document(path)
+            # appending the document to the documents list
+            documents.append(doc)
+            # appending the number of pages in the document
+            num_pages.append(1)
+            # adding the document name to the doc_names list
+            doc_names.append(doc[0].metadata['filename'])
+            print(f"Document {doc[0].metadata['filename']} loaded")
            
    # so we need to create a document id for each document
    docs_id = [uuid4().hex for i in range(len(documents))]
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
    

 # A document search function
-def search(db, query, k=4):
+def search(db, query, k=3):
+  """Run a similarity search and summarise the hits.
+
+  Calls ``db.similarity_search(query, k)`` and walks the returned documents.
+
+  Returns:
+    A 3-tuple: the page content of the first hit, the page contents of all
+    hits each followed by a newline, and one entry per hit holding its
+    ``'page'`` metadata or, when that key is missing, its ``'filename'``
+    metadata.
+
+  Raises:
+    KeyError: a hit has neither ``'page'`` nor ``'filename'`` metadata
+      (assuming ``metadata`` is a plain dict).
+  """
  docs = db.similarity_search(query, k)
+  # NOTE(review): `all` shadows the builtin of the same name
  all = ""
  pages = []
  for doc in docs:
    all += f"{doc.page_content}\n"
+    try:
        pages.append(doc.metadata['page'])
+    # only a missing 'page' key should fall back to the file name; a bare
+    # except would also hide unrelated failures
+    except KeyError:
+        pages.append(doc.metadata['filename'])
+  # NOTE(review): docs[0] fails when the search returns no hits - confirm
+  # callers never reach this with an empty result
  return docs[0].page_content, all, pages
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -q pdfplumber"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from text_extractor import TextExtractor\n",
+    "from langchain_core.documents  import Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# creating a function to extract texts from image\n",
+    "def create_image_document(image_path):\n",
+    "    # getting the image name from the image path\n",
+    "    image_name = image_path.split('/')[-1].split('.')[0]\n",
+    "    # setting image name as metadata\n",
+    "    metadata = {'filename': image_name}\n",
+    "    text_extractor = TextExtractor()\n",
+    "    text = text_extractor.read_text_from_image(image_path)\n",
+    "    # removing special characters and line breaks\n",
+    "    text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n",
+    "    doc = Document(page_content=text, metadata=metadata)\n",
+    "    # returning the document\n",
+    "    return [doc]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(metadata={'filename': 'IMG_1438'}, page_content='ex   a\\n\\nAccidented car before repair\\n')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# testing the function\n",
+    "image_path = 'data/IMG_1438.jpeg'\n",
+    "text = create_image_document(image_path)\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'filename': 'IMG_1438'}"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text[0].metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "smog_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -50,3 +50,33 @@
 2024-08-07 18:46:38,939 - INFO - Loading the embeddings
 2024-08-07 18:46:38,939 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
 2024-08-07 18:47:03,089 - INFO - Embeddings loaded
+2024-08-08 14:03:36,111 - INFO - Loading the embeddings
+2024-08-08 14:03:36,113 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
+2024-08-08 14:03:39,637 - INFO - Embeddings loaded
+2024-08-08 14:03:39,637 - INFO - Loading data from ./data
+2024-08-08 14:04:29,085 - INFO - Data loaded
+2024-08-08 14:04:29,087 - INFO - Creating vector store
+2024-08-08 14:06:40,106 - INFO - Loading the embeddings
+2024-08-08 14:06:40,106 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
+2024-08-08 14:06:43,523 - INFO - Embeddings loaded
+2024-08-08 14:06:43,523 - INFO - Loading data from ./data
+2024-08-08 14:20:21,150 - INFO - Loading the embeddings
+2024-08-08 14:20:21,150 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
+2024-08-08 14:20:25,150 - INFO - Embeddings loaded
+2024-08-08 14:20:25,150 - INFO - Loading data from ./data
+2024-08-08 14:21:13,769 - INFO - Data loaded
+2024-08-08 14:21:13,769 - INFO - Creating vector store
+2024-08-08 14:21:24,386 - INFO - Vector store created
+2024-08-08 14:21:24,386 - INFO - Saving the vector store
+2024-08-08 14:21:24,386 - INFO - Vector store saved
+2024-08-08 14:22:17,106 - INFO - Receiving the search query
+2024-08-08 14:22:23,740 - INFO - Searching for Accidented car before repair
+2024-08-08 14:24:45,013 - INFO - Receiving the search query
+2024-08-08 14:25:07,699 - INFO - Searching for Accidented car before repair
+2024-08-08 14:28:43,776 - INFO - Receiving the search query
+2024-08-08 14:28:46,944 - INFO - Searching for Accidented car before repair
+2024-08-08 14:29:13,295 - INFO - Receiving the search query
+2024-08-08 14:29:17,628 - INFO - Searching for Accidented car before repair
+2024-08-08 14:29:17,820 - INFO - Search completed
+2024-08-08 14:29:17,820 - INFO - Page content: Accidented car Before repair
+
@@ -11,4 +11,6 @@ langchain-text-splitters
 unstructured[all-docs]
 docx2txt
 docx
-"fastapi[standard]"
+fastapi[standard]
+pdfplumber
+pytesseract
@@ -44,8 +44,8 @@ class TextExtractor:
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""
-        finally:
-            os.remove(image_path)
+        # finally:
+        #     os.remove(image_path)

    def read_text_from_pdf(self, pdf_path):
        """
@@ -0,0 +1,102 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}