complete document ingestion pipeline

2024-08-07 17:50:40 +01:00
parent c65b0ecdb9
commit 8e6acc7cf8
11 changed files with 739 additions and 438 deletions
@@ -0,0 +1 @@
+{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
@@ -1,5 +1,15 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+import faiss
+from langchain_community.docstore.in_memory import InMemoryDocstore
 from langchain_community.vectorstores import FAISS
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import Docx2txtLoader
+from uuid import uuid4
+from langchain_core.documents  import Document
+import os
+import json


 # loading the embedding model
@@ -16,21 +26,170 @@ def load_embedding_model():
 embeddings = load_embedding_model()


-# A function to create the vector store
-def create_vector_store(document, embeddings=embeddings):
-  embed_db = FAISS.from_documents(document, embeddings)
-  return embed_db
+def create_documents(doc):
+    """Split the first loaded document into chunked ``Document`` objects.
+
+    Only ``doc[0]`` is read. Its text is split with a
+    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
+    ``chunk_overlap=10``) and every chunk receives a copy of the source
+    metadata with ``page`` set to the chunk's index.
+    """
+    source = doc[0]
+    source_text = source.page_content
+    source_metadata = source.metadata
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=10,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    chunks = splitter.create_documents([source_text])
+    # 'page' holds the chunk index here, not a page number of the source file.
+    return [
+        Document(
+            page_content=chunk.page_content,
+            metadata={**source_metadata, 'page': index},
+        )
+        for index, chunk in enumerate(chunks)
+    ]

-# A function to save the embedded data
-def save_embedded_data(docs,  key="pdf"):
-  docs.save_local(f"vec-db/index/faiss_index_{key}")
+
+def load_txt_document(document_path):
+    """Load a plain-text file and split it into chunked documents.
+
+    Args:
+        document_path: Path of the ``.txt`` file to read.
+
+    Returns:
+        The list of chunk documents produced by ``create_documents``.
+
+    Raises:
+        ValueError: If the file cannot be loaded or split; the underlying
+            error is chained as ``__cause__``.
+    """
+    try:
+        loaded = TextLoader(document_path).load()
+        return create_documents(loaded)
+    except Exception as exc:
+        # ``except Exception`` leaves KeyboardInterrupt/SystemExit alone, and
+        # ``from exc`` keeps the real cause visible in the traceback.
+        raise ValueError(f"Error loading -- {document_path}") from exc
+  
+  
+def load_docx_document(document_path):
+    """Load a ``.docx`` file and split it into chunked documents.
+
+    Args:
+        document_path: Path of the ``.docx`` file to read.
+
+    Returns:
+        The list of chunk documents produced by ``create_documents``.
+
+    Raises:
+        ValueError: If the file cannot be loaded or split; the underlying
+            error is chained as ``__cause__``.
+    """
+    try:
+        loaded = Docx2txtLoader(document_path).load()
+        return create_documents(loaded)
+    except Exception as exc:
+        # ``except Exception`` leaves KeyboardInterrupt/SystemExit alone, and
+        # ``from exc`` keeps the real cause visible in the traceback.
+        raise ValueError(f"Error loading -- {document_path}") from exc
+
+    
+# A function that loads a PDF document
+def load_pdf_document(document_path):
+    """Load a PDF file with ``PyPDFLoader.load_and_split``.
+
+    Args:
+        document_path: Path of the ``.pdf`` file to read.
+
+    Returns:
+        The list of documents returned by ``load_and_split``.
+
+    Raises:
+        ValueError: If the file cannot be loaded; the underlying error is
+            chained as ``__cause__``.
+    """
+    try:
+        return PyPDFLoader(document_path).load_and_split()
+    except Exception as exc:
+        # ``except Exception`` leaves KeyboardInterrupt/SystemExit alone, and
+        # ``from exc`` keeps the real cause visible in the traceback.
+        raise ValueError(f"Error loading -- {document_path}") from exc
+    
+
+
+
+# A general function that loads textual documents
+def load_document(document_path):
+    """Load a document with the loader that matches its file suffix.
+
+    Paths ending in ``.pdf``, ``.txt`` or ``.docx`` are handled; any other
+    path raises ``ValueError``.
+    """
+    loaders_by_suffix = (
+        (".pdf", load_pdf_document),
+        (".txt", load_txt_document),
+        (".docx", load_docx_document),
+    )
+    for suffix, loader in loaders_by_suffix:
+        if document_path.endswith(suffix):
+            return loader(document_path)
+    raise ValueError(f"Unsupported document type for {document_path}")
+
+
+
+# Save a vector store to vec-db/index/faiss_index_<key> via save_local().
+# NOTE(review): despite its name, `embeddings` is the object whose
+# save_local() is called; create_vector_store passes the vector store here,
+# not the embedding model.
+def save_embedded_data(embeddings, key="data"):
+  embeddings.save_local(f"vec-db/index/faiss_index_{key}")
  print("Embeddings saved")

-# A function to load the embedded data
-def load_embedded_data(embeddings=embeddings, key="pdf"):
+# Load the FAISS store saved under vec-db/index/faiss_index_<key>, passing
+# `embeddings` (the module-level model by default) to FAISS.load_local.
+# NOTE(review): allow_dangerous_deserialization=True opts in to loading
+# pickled data; only load index directories this application wrote itself.
+def load_embedded_data(embeddings=embeddings, key="data"):
  embed_db = FAISS.load_local(f"vec-db/index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
  return embed_db

+
+# creating a function to load all documents from a directory. 
+def load_documents_from_directory(directory_path: str):
+    """Load every supported text document found directly in a directory.
+
+    Each file is loaded through ``load_document``. A manifest named
+    ``documents.json`` holding the document names, the generated ids and the
+    page counts is written into the same directory; keys other than these
+    three that already exist in the manifest are kept.
+
+    Args:
+        directory_path: Directory whose files should be ingested.
+
+    Returns:
+        A ``(documents, docs_id, num_pages)`` tuple of parallel lists: the
+        loaded documents per file, one generated hex id per file, and the
+        number of loaded documents per file.
+    """
+    # Only the extensions ``load_document`` can handle. Accepting 'doc' or
+    # 'md' here would let a single such file abort the whole directory load.
+    supported_extensions = {'pdf', 'txt', 'docx'}
+
+    documents = []
+    doc_names = []
+    num_pages = []
+    # Sorted so the ids and the manifest order do not depend on the
+    # arbitrary order returned by os.listdir.
+    for file_name in sorted(os.listdir(directory_path)):
+        path = os.path.join(directory_path, file_name)
+        # splitext keeps dots inside the name: "report.v2.pdf" -> "report.v2".
+        doc_name, extension = os.path.splitext(file_name)
+        if extension.lstrip('.') not in supported_extensions:
+            continue
+        if not os.path.isfile(path):
+            continue
+        doc = load_document(path)
+        documents.append(doc)
+        num_pages.append(len(doc))
+        doc_names.append(doc_name)
+        print(f"Document {doc_name} loaded")
+
+    # One generated id per loaded document.
+    docs_id = [uuid4().hex for _ in documents]
+
+    json_file = os.path.join(directory_path, "documents.json")
+    data = {}
+    if os.path.exists(json_file):
+        with open(json_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    data['doc_names'] = doc_names
+    data['docs_id'] = docs_id
+    data['num_pages'] = num_pages
+    with open(json_file, 'w', encoding='utf-8') as f:
+        json.dump(data, f)
+
+    return documents, docs_id, num_pages
+
+
+# A function to create vector store
+def create_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
+    """Build a FAISS vector store from loaded documents and save it.
+
+    Args:
+        embeddings: Embedding model used by the store.
+        documents: One list of page/chunk documents per source file.
+        docs_id: One id per source file, parallel to ``documents``.
+        num_pages: Number of entries per source file, parallel to
+            ``documents``.
+
+    Returns:
+        The populated vector store, which is also saved under key ``"data"``.
+    """
+    # Size the index from the model itself so that switching to an embedding
+    # model with another dimension does not break the index.
+    dimension = len(embeddings.embed_query("hello world"))
+    index = faiss.IndexFlatL2(dimension)
+    vector_store = FAISS(
+        embedding_function=embeddings,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={},
+    )
+    for doc_id, pages, page_count in zip(docs_id, documents, num_pages):
+        if not pages:
+            # Nothing to embed for an empty document.
+            continue
+        # Each entry is stored under "<document id><entry index>".
+        page_ids = [f"{doc_id}{page_index}" for page_index in range(page_count)]
+        vector_store.add_documents(documents=pages, ids=page_ids)
+
+    # saving the vector store automatically
+    save_embedded_data(vector_store, key="data")
+
+    return vector_store
+
+# creating a function to add documents to the vector store
+def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
+    """Add documents to the saved vector store and persist the result.
+
+    Args:
+        embeddings: Embedding model used to load the saved store.
+        documents: One list of page/chunk documents per source file.
+        docs_id: One id per source file, parallel to ``documents``.
+        num_pages: Number of entries per source file, parallel to
+            ``documents``.
+
+    Returns:
+        The updated vector store.
+    """
+    vector_store = load_embedded_data(embeddings)
+    for doc_id, pages, page_count in zip(docs_id, documents, num_pages):
+        if not pages:
+            # Nothing to embed for an empty document.
+            continue
+        # Each entry is stored under "<document id><entry index>".
+        page_ids = [f"{doc_id}{page_index}" for page_index in range(page_count)]
+        vector_store.add_documents(documents=pages, ids=page_ids)
+    # The additions only exist in this in-memory copy until it is saved.
+    save_embedded_data(vector_store, key="data")
+    print ("Documents added to the vector store")
+    return vector_store
+    
+
 # A document search function
 def search(db, query, k=4):
  docs = db.similarity_search(query, k)
@@ -26,3 +26,6 @@
 2024-08-05 22:12:55,205 - INFO - Receiving the search query
 2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
 2024-08-05 22:13:04,241 - INFO - Search completed
+2024-08-07 17:49:19,962 - INFO - Receiving the search query
+2024-08-07 17:49:29,498 - INFO - Searching for what is lda?
+2024-08-07 17:49:29,876 - INFO - Search completed
@@ -0,0 +1,88 @@
+from flask_restx import Namespace, Resource, fields
+from flask import request, jsonify, current_app as app, send_file
+from ...services.ocr import OCRService
+from ...utils.decorators.auth import protected_route
+from .models.errors import error_404, error_500
+from .models.response import response
+import json
+import os
+import numpy as np
+from werkzeug.datastructures import FileStorage
+
+# Namespace for the OCR endpoint, registered under /v2/api/tools/ocr.
+# NOTE(review): description='Description' looks like a placeholder.
+api = Namespace('OCR',
+                description='Description',
+                path='/v2/api/tools/ocr')
+
+# Request parser for the upload: a required 'file' argument taken from the
+# files part of the request.
+upload_parser = api.parser()
+upload_parser.add_argument('file', location='files',
+                           type=FileStorage, required=True)
+# OCR Data Model
+ocr_model = api.model('OCR', {
+    'format': fields.String(required=True),
+    'data': fields.String(required=False),
+
+})
+
+# Response model: the imported `response` model cloned with a nested
+# 'model' field that holds the OCR data model.
+success_response = api.clone('OCR Model Response', response, {
+    'model': fields.Nested(ocr_model)
+})
+
+
+@api.route('')
+@api.doc(security='apikey')
+class OCRResource(Resource):
+    """OCR endpoint: accepts an uploaded file and returns the recognised text."""
+
+    @api.doc('get_text')
+    @api.expect(upload_parser)
+    @protected_route
+    def post(self):
+        """Run OCR on the uploaded file.
+
+        The output format is read from the ``format`` form field, query
+        parameter or JSON body, in that order, and defaults to ``'text'``
+        (``'txt'`` is accepted as an alias).
+
+        Returns:
+            For ``'text'``, the generated text file as an attachment; for
+            ``'json'``, a ``model`` payload holding the JSON-encoded result.
+            When the OCR service reports no result, a JSON body with its
+            error message.
+        """
+        # Function-scope imports; werkzeug is already a dependency of this
+        # module.
+        from werkzeug.exceptions import HTTPException
+        from werkzeug.utils import secure_filename
+
+        # A file upload is multipart, so the body is normally not JSON;
+        # silent=True makes get_json() return None instead of failing.
+        json_body = request.get_json(silent=True)
+        if not isinstance(json_body, dict):
+            json_body = {}
+        output_format = (request.form.get('format')
+                         or request.args.get('format')
+                         or json_body.get('format')
+                         or 'text')
+        if output_format == 'txt':
+            output_format = 'text'
+
+        args = upload_parser.parse_args()
+        file = args.get('file')
+        # secure_filename drops directory components, so a crafted upload
+        # name cannot make file.save() write outside the working directory.
+        filename = secure_filename(file.filename) if file and file.filename else ''
+        if not filename:
+            api.abort(code=400, message="Invalid file", error=True)
+        if output_format not in ('text', 'json'):
+            api.abort(code=400, message="Invalid Format", error=True)
+
+        try:
+            file.save(filename)
+            ocr_service = OCRService(image_directory='',
+                                     export_directory=app.config['UPLOAD_FOLDER'],
+                                     language='en')
+            result = ocr_service.read_text(filename, output_format=output_format)
+
+            if output_format == 'text':
+                txt_file, message = result
+                if txt_file:
+                    return send_file(os.path.join("..", '..', '..', txt_file), mimetype='text/plain', as_attachment=True, download_name=txt_file)
+                return jsonify(error=message)
+
+            json_data, message = result
+            if json_data:
+                result_json = json.dumps(json_data, default=np_encoder)
+                return {'model': {
+                    'format': output_format,
+                    'data': result_json
+                }}
+            return jsonify(error=message)
+        except HTTPException:
+            # Let abort() responses through instead of masking them as 500.
+            raise
+        except Exception:
+            app.logger.exception("OCR request failed")
+            api.abort(code=500, message="Something went wrong", error=True)
+
+
+def np_encoder(object):
+    """``json.dumps`` ``default`` hook that converts NumPy values.
+
+    Args:
+        object: Value the JSON encoder could not serialise. The name shadows
+            the builtin but is kept so existing callers are unaffected.
+
+    Returns:
+        The native Python scalar for a NumPy scalar, or a nested list for a
+        NumPy array.
+
+    Raises:
+        TypeError: For any other type, so an unsupported value is reported
+            instead of being silently written as ``null``.
+    """
+    if isinstance(object, np.generic):
+        return object.item()
+    if isinstance(object, np.ndarray):
+        return object.tolist()
+    raise TypeError(f"Object of type {type(object).__name__} is not JSON serializable")
@@ -21,5 +21,4 @@ if __name__ == "__main__":
    logger.info(f"Page content: {page_content}")
    print(f"Page content: {page_content}")
    print(f"Pages: {pages}")
-    print(f"All: {all}")
    print("Search completed")
@@ -0,0 +1,158 @@
+import pytesseract
+from PIL import Image
+import pdfplumber
+import platform
+import os
+import io
+
+
+class TextExtractor:
+    """Extract text from images and PDF files with pytesseract and pdfplumber.
+
+    The public ``read_*`` and ``extract_*`` methods delete the input file
+    once it has been processed.
+    """
+
+    def __init__(self):
+        self.set_tesseract_path()
+
+    def set_tesseract_path(self):
+        """
+        Sets the path to the Tesseract executable.
+
+        An executable found on ``PATH`` is preferred; otherwise the
+        conventional install location for the detected platform is used.
+        """
+        # Function-scope import: shutil is only needed here.
+        import shutil
+
+        found = shutil.which('tesseract')
+        if found:
+            pytesseract.pytesseract.tesseract_cmd = found
+            return
+
+        default_paths = {
+            'Linux': '/usr/bin/tesseract',
+            'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
+            'Darwin': '/usr/local/bin/tesseract',
+        }
+        default_path = default_paths.get(platform.system())
+        if default_path is None:
+            print("Unsupported platform. Please set the Tesseract path manually.")
+        else:
+            pytesseract.pytesseract.tesseract_cmd = default_path
+
+    @staticmethod
+    def _remove_file(path):
+        """
+        Deletes a file, ignoring the case where it does not exist.
+
+        Args:
+            path (str): Path of the file to delete.
+        """
+        try:
+            os.remove(path)
+        except FileNotFoundError:
+            pass
+
+    def read_text_from_image(self, image_path):
+        """
+        Reads text from an image using pytesseract, then deletes the image.
+
+        Args:
+            image_path (str): Path to the image file.
+
+        Returns:
+            str: Extracted text from the image, or "" on error.
+        """
+        try:
+            with Image.open(image_path) as img:
+                return pytesseract.image_to_string(img)
+        except Exception as e:
+            print(f"Error reading text from image: {e}")
+            return ""
+        finally:
+            # A missing file must not replace the "" result with an exception.
+            self._remove_file(image_path)
+
+    def read_text_from_pdf(self, pdf_path):
+        """
+        Reads the text of every page of a PDF file using pdfplumber, then
+        deletes the PDF.
+
+        Args:
+            pdf_path (str): Path to the PDF file.
+
+        Returns:
+            str: Extracted text from the PDF, or "" on error.
+        """
+        try:
+            with pdfplumber.open(pdf_path) as pdf:
+                # `or ""` guards against extract_text() returning None for a
+                # page without text (older pdfplumber versions do).
+                return "".join(page.extract_text() or "" for page in pdf.pages)
+        except Exception as e:
+            print(f"Error reading text from PDF: {e}")
+            return ""
+        finally:
+            self._remove_file(pdf_path)
+
+    def extract_text_from_pdf(self, pdf_path):
+        """
+        Reads text from a PDF file, running OCR on pages that contain
+        images, then deletes the PDF.
+
+        A page with images contributes only the OCR text of its images; any
+        other page contributes its extracted text.
+
+        Args:
+            pdf_path (str): Path to the PDF file.
+
+        Returns:
+            str: Extracted text from the PDF, or "" on error.
+        """
+        try:
+            print("path", pdf_path)
+            parts = []
+            with pdfplumber.open(pdf_path) as pdf:
+                for page in pdf.pages:
+                    if self._has_images(page):
+                        parts.append(self._extract_text_from_images(page))
+                    else:
+                        parts.append(page.extract_text() or "")
+            return "".join(parts)
+        except Exception as e:
+            print(f"Error reading text from PDF: {e}")
+            return ""
+        finally:
+            self._remove_file(pdf_path)
+
+    def _has_images(self, page):
+        """
+        Checks if a PDF page contains images.
+
+        Args:
+            page (pdfplumber.Page): PDF page object.
+
+        Returns:
+            bool: True if the page contains images, False otherwise.
+        """
+        return bool(page.objects.get("image"))
+
+    def _extract_text_from_images(self, page):
+        """
+        Extracts text from every image within a PDF page using pytesseract.
+
+        Args:
+            page (pdfplumber.Page): PDF page object.
+
+        Returns:
+            str: Concatenated text extracted from the images.
+        """
+        return "".join(
+            self._read_text_from_image(
+                image["x0"], image["y0"], image["x1"], image["y1"], image["stream"]
+            )
+            for image in page.objects.get("image", [])
+        )
+
+    def _read_text_from_image(self, x0, y0, x1, y1, stream):
+        """
+        Reads text from an embedded PDF image using pytesseract.
+
+        Args:
+            x0, y0, x1, y1 (float): Coordinates of the image on the page;
+                currently unused because the whole image is processed.
+            stream: Image stream object providing ``get_rawdata()``.
+
+        Returns:
+            str: Extracted text from the image, or "" on error.
+        """
+        try:
+            # Decode the raw stream bytes into a PIL image.
+            raw_image = stream.get_rawdata()
+            pil_image = Image.open(io.BytesIO(raw_image))
+
+            # Perform OCR on the image and extract text
+            return pytesseract.image_to_string(pil_image)
+        except Exception as e:
+            print(f"Error extracting text from image: {e}")
+            return ""
+
+
				`@@ -0,0 +1 @@`
				{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}