image text extraction completed

2024-08-08 14:58:44 +01:00
parent 9a2a4c5fdd
commit c54dc17989
13 changed files with 331 additions and 7 deletions
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import TextLoader
 from langchain_community.document_loaders import Docx2txtLoader
 from uuid import uuid4
 from langchain_core.documents  import Document
+from text_extractor import TextExtractor
 import os
 import json

@@ -90,6 +91,18 @@ def load_document(document_path):
    else:
        raise ValueError(f"Unsupported document type for {document_path}")

+def create_image_document(image_path):
+    """Return a one-item list holding a Document with the image's extracted text."""
+    # basename/splitext keep dots inside the name and follow the OS path separator
+    image_name = os.path.splitext(os.path.basename(image_path))[0]
+    # the image name (without extension) is the document's only metadata
+    metadata = {'filename': image_name}
+    text_extractor = TextExtractor()
+    text = text_extractor.read_text_from_image(image_path)
+    # keep letters, digits and whitespace (line breaks included); drop the rest
+    text = ''.join(e for e in text if e.isalnum() or e.isspace())
+    doc = Document(page_content=text, metadata=metadata)
+    return [doc]


 def save_embedded_data(embeddings, key="data"):
@@ -133,6 +146,16 @@ def load_documents_from_directory(directory_path: str):
            # adding the document name to the doc_names list
            doc_names.append(doc_name)
            print(f"Document {doc_name} loaded")
+        elif extension in image_doc:
+            # creating an image document
+            doc = create_image_document(path)
+            # appending the document to the documents list
+            documents.append(doc)
+            # appending the number of pages in the document
+            num_pages.append(1)
+            # adding the document name to the doc_names list
+            doc_names.append(doc[0].metadata['filename'])
+            print(f"Document {doc[0].metadata['filename']} loaded")
            
    # so we need to create a document id for each document
    docs_id = [uuid4().hex for i in range(len(documents))]
@@ -189,11 +212,14 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
    

 # A document search function
-def search(db, query, k=4):
+def search(db, query, k=3):
  docs = db.similarity_search(query, k)
  all = ""
  pages = []
  for doc in docs:
-      all += f"{doc.page_content}\n"
-      pages.append(doc.metadata['page'])
+    all += f"{doc.page_content}\n"
+    # Documents with a 'page' entry report it; image documents only carry
+    # 'filename' metadata, so they report that instead.
+    meta = doc.metadata
+    pages.append(meta['page'] if 'page' in meta else meta['filename'])
  return docs[0].page_content, all, pages