image text extraction completed

This commit is contained in:
timothyafolami
2024-08-08 14:58:44 +01:00
parent 9a2a4c5fdd
commit c54dc17989
13 changed files with 331 additions and 7 deletions
+29 -3
View File
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from uuid import uuid4
from langchain_core.documents import Document
from text_extractor import TextExtractor
import os
import json
@@ -90,6 +91,18 @@ def load_document(document_path):
else:
raise ValueError(f"Unsupported document type for {document_path}")
def create_image_document(image_path):
    """Build a one-document list from the text read out of an image.

    The image's base name (directory and final extension removed) is stored
    under the ``filename`` metadata key. The text returned by
    ``TextExtractor.read_text_from_image`` is filtered so that only
    alphanumeric and whitespace characters remain; line breaks are kept.

    Args:
        image_path: Path to the image file.

    Returns:
        A list containing the single ``Document`` created for the image.
    """
    # os.path honours the platform's path separators and strips only the
    # final extension, so "scan.v2.png" is named "scan.v2" instead of "scan".
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    # setting image name as metadata
    metadata = {'filename': image_name}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # Drop punctuation and other symbols. str.isspace() is already true for
    # newlines, so line breaks survive the filter.
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
    # returning the document in a list
    return [Document(page_content=text, metadata=metadata)]
def save_embedded_data(embeddings, key="data"):
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
# adding the document name to the doc_names list
doc_names.append(doc_name)
print(f"Document {doc_name} loaded")
elif extension in image_doc:
# creating an image document
doc = create_image_document(path)
# appending the document to the documents list
documents.append(doc)
# appending the number of pages in the document
num_pages.append(1)
# adding the document name to the doc_names list
doc_names.append(doc[0].metadata['filename'])
print(f"Document {doc[0].metadata['filename']} loaded")
# so we need to create a document id for each document
docs_id = [uuid4().hex for i in range(len(documents))]
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
# A document search function
def search(db, query, k=3):
    """Run a similarity search and summarise the matching documents.

    Args:
        db: Object exposing ``similarity_search(query, k)`` that returns a
            sequence of documents with ``page_content`` and ``metadata``.
        query: Text to search for.
        k: Number of documents to request from ``db``.

    Returns:
        A tuple ``(top_content, combined_content, pages)`` where
        ``top_content`` is the first document's text, ``combined_content``
        is every document's text with a newline after each, and ``pages``
        holds each document's ``page`` metadata value, or its ``filename``
        value when it has no ``page``. When the search returns no
        documents the result is ``("", "", [])``.

    Raises:
        KeyError: If a document's metadata has neither ``page`` nor
            ``filename``.
    """
    docs = db.similarity_search(query, k)
    if not docs:
        # Nothing matched: return empty values rather than failing on docs[0].
        return "", "", []

    combined = "".join(f"{doc.page_content}\n" for doc in docs)

    pages = []
    for doc in docs:
        try:
            pages.append(doc.metadata['page'])
        except KeyError:
            # Documents without a page number (e.g. those built by
            # create_image_document) are identified by filename instead.
            pages.append(doc.metadata['filename'])
    return docs[0].page_content, combined, pages