image text extraction completed
This commit is contained in:
+29
-3
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
|
||||
from langchain_community.document_loaders import Docx2txtLoader
|
||||
from uuid import uuid4
|
||||
from langchain_core.documents import Document
|
||||
from text_extractor import TextExtractor
|
||||
import os
|
||||
import json
|
||||
|
||||
@@ -90,6 +91,18 @@ def load_document(document_path):
|
||||
else:
|
||||
raise ValueError(f"Unsupported document type for {document_path}")
|
||||
|
||||
def create_image_document(image_path):
    """Create a document from the text extracted out of an image.

    The image's base name (directory and final extension removed) is stored
    under the ``filename`` metadata key, and the extracted text, stripped of
    punctuation and symbols, becomes the page content.

    Args:
        image_path: Path of the image; it is passed unchanged to
            ``TextExtractor.read_text_from_image``.

    Returns:
        A one-element list containing the resulting ``Document``.
    """
    # os.path understands the platform's separators and removes only the
    # final extension, so "scan.v2.png" is named "scan.v2" instead of being
    # truncated at the first dot.
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    metadata = {'filename': image_name}

    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)

    # Drop special characters only: letters, digits and every kind of
    # whitespace (line breaks included) are kept.
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

    # The document is returned wrapped in a list.
    return [Document(page_content=text, metadata=metadata)]
|
||||
|
||||
|
||||
def save_embedded_data(embeddings, key="data"):
|
||||
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc_name)
|
||||
print(f"Document {doc_name} loaded")
|
||||
elif extension in image_doc:
|
||||
# creating an image document
|
||||
doc = create_image_document(path)
|
||||
# appending the document to the documents list
|
||||
documents.append(doc)
|
||||
# appending the number of pages in the document
|
||||
num_pages.append(1)
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc[0].metadata['filename'])
|
||||
print(f"Document {doc[0].metadata['filename']} loaded")
|
||||
|
||||
# so we need to create a document id for each document
|
||||
docs_id = [uuid4().hex for i in range(len(documents))]
|
||||
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
|
||||
|
||||
|
||||
# A document search function
|
||||
def search(db, query, k=3):
    """Run a similarity search and collect the matching text and sources.

    Args:
        db: Object exposing ``similarity_search(query, k)`` that returns
            documents with ``page_content`` and ``metadata`` attributes.
        query: The search query.
        k: Number of documents to request.

    Returns:
        A tuple ``(best, combined, pages)``:
        ``best`` is the page content of the first hit (an empty string when
        there are no hits), ``combined`` is every hit's page content with a
        newline after each, and ``pages`` holds, per hit, ``metadata['page']``
        or, when that key is absent, ``metadata['filename']``.

    Raises:
        KeyError: If a hit has neither a ``page`` nor a ``filename`` metadata
            key.
    """
    docs = db.similarity_search(query, k)

    pages = []
    for doc in docs:
        try:
            pages.append(doc.metadata['page'])
        except KeyError:
            # No page number in the metadata (e.g. the image documents built
            # with only a 'filename' key): fall back to the file name.
            pages.append(doc.metadata['filename'])

    # A single join avoids repeated string concatenation and avoids
    # shadowing the builtin ``all``.
    combined = "".join(f"{doc.page_content}\n" for doc in docs)

    # An empty result set yields an empty best match instead of IndexError.
    best = docs[0].page_content if docs else ""
    return best, combined, pages
|
||||
|
||||
Reference in New Issue
Block a user