image text extraction completed

This commit is contained in:
timothyafolami
2024-08-08 14:58:44 +01:00
parent 9a2a4c5fdd
commit c54dc17989
13 changed files with 331 additions and 7 deletions
+3
View File
@@ -0,0 +1,3 @@
Ai indexing
data
images
Binary file not shown.
View File
+1 -1
View File
@@ -1 +1 @@
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["5f26879376a44a77bbc2b966b9189ca4", "51b1c6cab5f1440e9fd948b6d858e812", "1d63ef4a149d4addb0803370885d70c1", "749ea365f2244eb6b23bb17e28d9cd2e", "e6d3736c0e8f424382c2ff5298814534", "91b116993e4b4865b3dc7bceca9749f0", "77f9558bd9894daeaf9aaea4013ed20e", "d974631f67d242739343b3c32e91355c", "a18ad23b3c7641b3a61e77e0e143a265", "0b710683db314b14ae6f0e0919a12068", "136c808efffa4f8798c55e7595c768a1", "236dc9603c9c4e83840721175d3dc861", "5aa9f750dbdd403c94abb53883c0fad2", "0382e54d68a84021803b07c7cf7c3ad9", "a772d008c9bf4ee6a2026f00998f3f2c", "66afb44563f6449ca705a39c9a72440d", "59ef1e9cc81b41d3a32d5dcc069a0ace", "9991145202384596bc3f5ff666d213bd", "d7f49b6629e84ec7bfd1a0048d2ade76", "689296161d6b46e8b9e792dbdc8a155d", "ba6be2ab8ae74042a9c9da51c46b8f90", "d62daf66b833419fae17333395cd7b04", "7b109e03c62343fd8f8e23dcf6bdfd3b", "8254386611fc4feb85744f69e5120e18", "022d6bae08274a618921c49590040a1f", "719ed0e4d9a94fe39799c227eaac1e05"], "num_pages": [1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Binary file not shown.
+29 -3
View File
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from uuid import uuid4
from langchain_core.documents import Document
from text_extractor import TextExtractor
import os
import json
@@ -90,6 +91,18 @@ def load_document(document_path):
else:
raise ValueError(f"Unsupported document type for {document_path}")
def create_image_document(image_path):
    """Wrap the text extracted from an image in a single-element Document list.

    Args:
        image_path: Path to the image file. Forward slashes and backslashes
            are both accepted as directory separators.

    Returns:
        A one-element list holding a Document whose ``page_content`` is the
        cleaned extracted text and whose metadata is
        ``{'filename': <file name without directory or extension>}``.
    """
    # Normalise Windows separators first: splitting on '/' alone left the
    # directory in the name (e.g. 'data\IMG_1436') when paths used backslashes.
    base_name = image_path.replace('\\', '/').rsplit('/', 1)[-1]
    # Strip only the final extension so dotted file names are not truncated.
    image_name = os.path.splitext(base_name)[0]
    metadata = {'filename': image_name}

    text = TextExtractor().read_text_from_image(image_path)
    # Keep letters, digits and whitespace (line breaks included); drop
    # every other character.
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

    # Returned as a list so callers can treat it like the other loaders' output.
    return [Document(page_content=text, metadata=metadata)]
def save_embedded_data(embeddings, key="data"):
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
# adding the document name to the doc_names list
doc_names.append(doc_name)
print(f"Document {doc_name} loaded")
elif extension in image_doc:
# creating an image document
doc = create_image_document(path)
# appending the document to the documents list
documents.append(doc)
# appending the number of pages in the document
num_pages.append(1)
# adding the document name to the doc_names list
doc_names.append(doc[0].metadata['filename'])
print(f"Document {doc[0].metadata['filename']} loaded")
# so we need to create a document id for each document
docs_id = [uuid4().hex for i in range(len(documents))]
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
# A document search function
def search(db, query, k=3):
    """Run a similarity search and gather the matching text and references.

    Args:
        db: Object exposing ``similarity_search(query, k)`` that returns
            documents with ``page_content`` and ``metadata`` attributes.
        query: The text to search for.
        k: Maximum number of documents to request.

    Returns:
        A tuple ``(top_content, combined_content, pages)`` where
        ``top_content`` is the first hit's text, ``combined_content`` is every
        hit's text joined with a trailing newline each, and ``pages`` lists,
        per hit, its ``'page'`` metadata or, when absent, its ``'filename'``.
        When the search returns nothing the result is ``("", "", [])``.

    Raises:
        KeyError: If a hit has neither ``'page'`` nor ``'filename'`` metadata.
    """
    docs = db.similarity_search(query, k)
    if not docs:
        # Nothing matched: keep the three-value shape instead of failing on docs[0].
        return "", "", []

    combined = "".join(f"{doc.page_content}\n" for doc in docs)

    pages = []
    for doc in docs:
        meta = doc.metadata
        # Image documents are stored with a 'filename' key and no 'page' key.
        pages.append(meta['page'] if 'page' in meta else meta['filename'])

    return docs[0].page_content, combined, pages
+161
View File
@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!pip install -q pdfplumber"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from text_extractor import TextExtractor\n",
"from langchain_core.documents import Document"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# creating a function to extract texts from image\n",
"def create_image_document(image_path):\n",
" # getting the image name from the image path\n",
" image_name = image_path.split('/')[-1].split('.')[0]\n",
" # setting image name as metadata\n",
" metadata = {'filename': image_name}\n",
" text_extractor = TextExtractor()\n",
" text = text_extractor.read_text_from_image(image_path)\n",
" # removing special characters (letters, digits and whitespace, including line breaks, are kept)\n",
" text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n",
" doc = Document(page_content=text, metadata=metadata)\n",
" # returning the document\n",
" return [doc]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(metadata={'filename': 'IMG_1438'}, page_content='ex a\\n\\nAccidented car before repair\\n')]\n"
]
}
],
"source": [
"# testing the function\n",
"image_path = 'data/IMG_1438.jpeg'\n",
"text = create_image_document(image_path)\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'filename': 'IMG_1438'}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text[0].metadata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "smog_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
+30
View File
@@ -50,3 +50,33 @@
2024-08-07 18:46:38,939 - INFO - Loading the embeddings
2024-08-07 18:46:38,939 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
2024-08-07 18:47:03,089 - INFO - Embeddings loaded
2024-08-08 14:03:36,111 - INFO - Loading the embeddings
2024-08-08 14:03:36,113 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
2024-08-08 14:03:39,637 - INFO - Embeddings loaded
2024-08-08 14:03:39,637 - INFO - Loading data from ./data
2024-08-08 14:04:29,085 - INFO - Data loaded
2024-08-08 14:04:29,087 - INFO - Creating vector store
2024-08-08 14:06:40,106 - INFO - Loading the embeddings
2024-08-08 14:06:40,106 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
2024-08-08 14:06:43,523 - INFO - Embeddings loaded
2024-08-08 14:06:43,523 - INFO - Loading data from ./data
2024-08-08 14:20:21,150 - INFO - Loading the embeddings
2024-08-08 14:20:21,150 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
2024-08-08 14:20:25,150 - INFO - Embeddings loaded
2024-08-08 14:20:25,150 - INFO - Loading data from ./data
2024-08-08 14:21:13,769 - INFO - Data loaded
2024-08-08 14:21:13,769 - INFO - Creating vector store
2024-08-08 14:21:24,386 - INFO - Vector store created
2024-08-08 14:21:24,386 - INFO - Saving the vector store
2024-08-08 14:21:24,386 - INFO - Vector store saved
2024-08-08 14:22:17,106 - INFO - Receiving the search query
2024-08-08 14:22:23,740 - INFO - Searching for Accidented car before repair
2024-08-08 14:24:45,013 - INFO - Receiving the search query
2024-08-08 14:25:07,699 - INFO - Searching for Accidented car before repair
2024-08-08 14:28:43,776 - INFO - Receiving the search query
2024-08-08 14:28:46,944 - INFO - Searching for Accidented car before repair
2024-08-08 14:29:13,295 - INFO - Receiving the search query
2024-08-08 14:29:17,628 - INFO - Searching for Accidented car before repair
2024-08-08 14:29:17,820 - INFO - Search completed
2024-08-08 14:29:17,820 - INFO - Page content: Accidented car Before repair
+3 -1
View File
@@ -11,4 +11,6 @@ langchain-text-splitters
unstructured[all-docs]
docx2txt
docx
"fastapi[standard]"
fastapi[standard]
pdfplumber
pytesseract
+2 -2
View File
@@ -44,8 +44,8 @@ class TextExtractor:
except Exception as e:
print(f"Error reading text from image: {e}")
return ""
finally:
os.remove(image_path)
# finally:
# os.remove(image_path)
def read_text_from_pdf(self, pdf_path):
"""
Binary file not shown.
Binary file not shown.
+102
View File
@@ -0,0 +1,102 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}