image text extraction completed
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
Ai indexing
|
||||
data
|
||||
images
|
||||
Binary file not shown.
+1
-1
@@ -1 +1 @@
|
||||
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
|
||||
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["5f26879376a44a77bbc2b966b9189ca4", "51b1c6cab5f1440e9fd948b6d858e812", "1d63ef4a149d4addb0803370885d70c1", "749ea365f2244eb6b23bb17e28d9cd2e", "e6d3736c0e8f424382c2ff5298814534", "91b116993e4b4865b3dc7bceca9749f0", "77f9558bd9894daeaf9aaea4013ed20e", "d974631f67d242739343b3c32e91355c", "a18ad23b3c7641b3a61e77e0e143a265", "0b710683db314b14ae6f0e0919a12068", "136c808efffa4f8798c55e7595c768a1", "236dc9603c9c4e83840721175d3dc861", "5aa9f750dbdd403c94abb53883c0fad2", "0382e54d68a84021803b07c7cf7c3ad9", "a772d008c9bf4ee6a2026f00998f3f2c", "66afb44563f6449ca705a39c9a72440d", "59ef1e9cc81b41d3a32d5dcc069a0ace", "9991145202384596bc3f5ff666d213bd", "d7f49b6629e84ec7bfd1a0048d2ade76", "689296161d6b46e8b9e792dbdc8a155d", "ba6be2ab8ae74042a9c9da51c46b8f90", "d62daf66b833419fae17333395cd7b04", "7b109e03c62343fd8f8e23dcf6bdfd3b", "8254386611fc4feb85744f69e5120e18", "022d6bae08274a618921c49590040a1f", "719ed0e4d9a94fe39799c227eaac1e05"], "num_pages": [1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
|
||||
Binary file not shown.
+27
-1
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
|
||||
from langchain_community.document_loaders import Docx2txtLoader
|
||||
from uuid import uuid4
|
||||
from langchain_core.documents import Document
|
||||
from text_extractor import TextExtractor
|
||||
import os
|
||||
import json
|
||||
|
||||
@@ -90,6 +91,18 @@ def load_document(document_path):
|
||||
else:
|
||||
raise ValueError(f"Unsupported document type for {document_path}")
|
||||
|
||||
def create_image_document(image_path):
    """Create a document from the text extracted out of an image.

    Args:
        image_path: Path to the image file. Both '/' and '\\' are accepted
            as directory separators.

    Returns:
        A one-element list holding a Document whose page_content is the
        cleaned extracted text and whose metadata is
        {'filename': <image name without directory or extension>}.
    """
    # getting the image name from the image path; separators are normalised
    # first so that a Windows-style path such as 'data\\IMG_1438.jpeg' yields
    # 'IMG_1438' rather than 'data\\IMG_1438'
    file_name = image_path.replace('\\', '/').rsplit('/', 1)[-1]
    image_name = file_name.split('.')[0]
    # setting image name as metadata
    metadata = {'filename': image_name}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # removing special characters; letters, digits and all whitespace
    # (line breaks included, since '\n' satisfies isspace()) are kept
    text = ''.join(e for e in text if e.isalnum() or e.isspace())
    doc = Document(page_content=text, metadata=metadata)
    # returning the document in a list
    return [doc]
|
||||
|
||||
|
||||
def save_embedded_data(embeddings, key="data"):
|
||||
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc_name)
|
||||
print(f"Document {doc_name} loaded")
|
||||
elif extension in image_doc:
|
||||
# creating an image document
|
||||
doc = create_image_document(path)
|
||||
# appending the document to the documents list
|
||||
documents.append(doc)
|
||||
# appending the number of pages in the document
|
||||
num_pages.append(1)
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc[0].metadata['filename'])
|
||||
print(f"Document {doc[0].metadata['filename']} loaded")
|
||||
|
||||
# so we need to create a document id for each document
|
||||
docs_id = [uuid4().hex for i in range(len(documents))]
|
||||
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
|
||||
|
||||
|
||||
# A document search function
|
||||
def search(db, query, k=3):
    """Run a similarity search and summarise the matching documents.

    Args:
        db: Object exposing similarity_search(query, k) that returns a
            sequence of documents with page_content and metadata attributes.
        query: The search text.
        k: Maximum number of documents to request (default 3).

    Returns:
        A tuple (top_content, combined_content, pages) where top_content is
        the page_content of the first match ("" when nothing matched),
        combined_content is every match's page_content with a newline after
        each one, and pages holds, per match, metadata['page'] when present
        and metadata['filename'] otherwise.

    Raises:
        KeyError: If a match has neither 'page' nor 'filename' metadata.
    """
    docs = db.similarity_search(query, k)
    # each match contributes its content followed by a newline
    combined = "".join(f"{doc.page_content}\n" for doc in docs)
    pages = []
    for doc in docs:
        metadata = doc.metadata
        # image documents carry no page number, so they are identified by
        # their file name instead
        if 'page' in metadata:
            pages.append(metadata['page'])
        else:
            pages.append(metadata['filename'])
    # an empty result set would otherwise fail on docs[0]
    top_content = docs[0].page_content if docs else ""
    return top_content, combined, pages
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -q pdfplumber"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from text_extractor import TextExtractor\n",
|
||||
"from langchain_core.documents import Document"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# creating a function to extract texts from image\n",
|
||||
"def create_image_document(image_path):\n",
|
||||
" # getting the image name from the image path\n",
|
||||
" image_name = image_path.split('/')[-1].split('.')[0]\n",
|
||||
" # setting image name as metadata\n",
|
||||
" metadata = {'filename': image_name}\n",
|
||||
" text_extractor = TextExtractor()\n",
|
||||
" text = text_extractor.read_text_from_image(image_path)\n",
|
||||
" # removing special characters and line breaks\n",
|
||||
" text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n",
|
||||
" doc = Document(page_content=text, metadata=metadata)\n",
|
||||
" # returning the document\n",
|
||||
" return [doc]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Document(metadata={'filename': 'IMG_1438'}, page_content='ex a\\n\\nAccidented car before repair\\n')]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# testing the function\n",
|
||||
"image_path = 'data/IMG_1438.jpeg'\n",
|
||||
"text = create_image_document(image_path)\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'filename': 'IMG_1438'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text[0].metadata"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "smog_env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -50,3 +50,33 @@
|
||||
2024-08-07 18:46:38,939 - INFO - Loading the embeddings
|
||||
2024-08-07 18:46:38,939 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
|
||||
2024-08-07 18:47:03,089 - INFO - Embeddings loaded
|
||||
2024-08-08 14:03:36,111 - INFO - Loading the embeddings
|
||||
2024-08-08 14:03:36,113 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
|
||||
2024-08-08 14:03:39,637 - INFO - Embeddings loaded
|
||||
2024-08-08 14:03:39,637 - INFO - Loading data from ./data
|
||||
2024-08-08 14:04:29,085 - INFO - Data loaded
|
||||
2024-08-08 14:04:29,087 - INFO - Creating vector store
|
||||
2024-08-08 14:06:40,106 - INFO - Loading the embeddings
|
||||
2024-08-08 14:06:40,106 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
|
||||
2024-08-08 14:06:43,523 - INFO - Embeddings loaded
|
||||
2024-08-08 14:06:43,523 - INFO - Loading data from ./data
|
||||
2024-08-08 14:20:21,150 - INFO - Loading the embeddings
|
||||
2024-08-08 14:20:21,150 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
|
||||
2024-08-08 14:20:25,150 - INFO - Embeddings loaded
|
||||
2024-08-08 14:20:25,150 - INFO - Loading data from ./data
|
||||
2024-08-08 14:21:13,769 - INFO - Data loaded
|
||||
2024-08-08 14:21:13,769 - INFO - Creating vector store
|
||||
2024-08-08 14:21:24,386 - INFO - Vector store created
|
||||
2024-08-08 14:21:24,386 - INFO - Saving the vector store
|
||||
2024-08-08 14:21:24,386 - INFO - Vector store saved
|
||||
2024-08-08 14:22:17,106 - INFO - Receiving the search query
|
||||
2024-08-08 14:22:23,740 - INFO - Searching for Accidented car before repair
|
||||
2024-08-08 14:24:45,013 - INFO - Receiving the search query
|
||||
2024-08-08 14:25:07,699 - INFO - Searching for Accidented car before repair
|
||||
2024-08-08 14:28:43,776 - INFO - Receiving the search query
|
||||
2024-08-08 14:28:46,944 - INFO - Searching for Accidented car before repair
|
||||
2024-08-08 14:29:13,295 - INFO - Receiving the search query
|
||||
2024-08-08 14:29:17,628 - INFO - Searching for Accidented car before repair
|
||||
2024-08-08 14:29:17,820 - INFO - Search completed
|
||||
2024-08-08 14:29:17,820 - INFO - Page content: Accidented car Before repair
|
||||
|
||||
|
||||
+3
-1
@@ -11,4 +11,6 @@ langchain-text-splitters
|
||||
unstructured[all-docs]
|
||||
docx2txt
|
||||
docx
|
||||
"fastapi[standard]"
|
||||
fastapi[standard]
|
||||
pdfplumber
|
||||
pytesseract
|
||||
+2
-2
@@ -44,8 +44,8 @@ class TextExtractor:
|
||||
except Exception as e:
|
||||
print(f"Error reading text from image: {e}")
|
||||
return ""
|
||||
finally:
|
||||
os.remove(image_path)
|
||||
# finally:
|
||||
# os.remove(image_path)
|
||||
|
||||
def read_text_from_pdf(self, pdf_path):
|
||||
"""
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,102 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user