complete document ingestion pipeline

2024-08-07 17:50:40 +01:00
parent c65b0ecdb9
commit 8e6acc7cf8
11 changed files with 739 additions and 438 deletions
@@ -0,0 +1 @@
 {"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
@@ -1,5 +1,15 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 import faiss
 from langchain_community.docstore.in_memory import InMemoryDocstore
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import TextLoader
 from langchain_community.document_loaders import Docx2txtLoader
 from uuid import uuid4
 from langchain_core.documents  import Document
 import os
 import json
 # loading the embedding model
@@ -16,21 +26,170 @@ def load_embedding_model():
 embeddings = load_embedding_model()
-# A function to create the vector store
+def create_documents(doc):
-def create_vector_store(document, embeddings=embeddings):
+    text = doc[0].page_content
-  embed_db = FAISS.from_documents(document, embeddings)
+    metadata = doc[0].metadata
-  return embed_db
+    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=10,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.create_documents([text])
    # converting the text into documents
    documents = []
    for i, chunk in enumerate(docs):
        # Increment page number based on the chunk index
        doc_metadata = metadata.copy()
        doc_metadata['page'] = i  # Assign page number based on chunk index
        document = Document(page_content=chunk.page_content, metadata=doc_metadata)
        documents.append(document)
    return documents
-# A function to save the embedded data
+
-def save_embedded_data(docs,  key="pdf"):
+def load_txt_document(document_path):
-  docs.save_local(f"vec-db/index/faiss_index_{key}")
+    try:
        txt_doc = TextLoader(document_path)
        text = txt_doc.load()
        # implementing document splitting
        docs = create_documents(text)
        return docs
    except:
        raise ValueError(f"Error loading -- {document_path}")
 def load_docx_document(document_path):
    """Load a .docx file and split its text into chunked documents.

    Args:
        document_path: Path of the .docx file to read.

    Returns:
        The list produced by ``create_documents`` for the loaded file.

    Raises:
        ValueError: If loading or splitting fails for any reason; the
            underlying error is attached as ``__cause__``.
    """
    try:
        loaded = Docx2txtLoader(document_path).load()
        # splitting the loaded text into chunked documents
        return create_documents(loaded)
    except Exception as err:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
        # SystemExit are not converted into ValueError; ``from err`` keeps the
        # real failure visible in the traceback.
        raise ValueError(f"Error loading -- {document_path}") from err
 # A function that loads a PDF document and splits it into documents
 def load_pdf_document(document_path):
    """Load a PDF file with ``PyPDFLoader`` and return its split documents.

    Args:
        document_path: Path of the PDF file to read.

    Returns:
        The value returned by ``PyPDFLoader(document_path).load_and_split()``.

    Raises:
        ValueError: If loading or splitting fails for any reason; the
            underlying error is attached as ``__cause__``.
    """
    try:
        return PyPDFLoader(document_path).load_and_split()
    except Exception as err:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
        # SystemExit are not converted into ValueError; ``from err`` keeps the
        # real failure visible in the traceback.
        raise ValueError(f"Error loading -- {document_path}") from err
 # A general function that loads textual documents
 def load_document(document_path):
    """Load a document with the loader that matches its file extension.

    The extension is compared case-insensitively, so ``report.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        document_path: Path of the file to load (.pdf, .txt or .docx).

    Returns:
        Whatever the matching loader returns (``load_pdf_document``,
        ``load_txt_document`` or ``load_docx_document``).

    Raises:
        ValueError: If the extension is not one of the supported types, or
            if the selected loader fails.
    """
    # Only the comparison uses the lowered path; loaders get the original.
    lowered_path = document_path.lower()
    if lowered_path.endswith(".pdf"):
        return load_pdf_document(document_path)
    if lowered_path.endswith(".txt"):
        return load_txt_document(document_path)
    if lowered_path.endswith(".docx"):
        return load_docx_document(document_path)
    raise ValueError(f"Unsupported document type for {document_path}")
 def save_embedded_data(embeddings, key="data"):
  """Save a vector store to the local index folder named after ``key``.

  Args:
      embeddings: Object exposing ``save_local(path)``; despite the
          parameter name, callers in this file pass the vector store.
      key: Suffix of the index folder, ``vec-db/index/faiss_index_<key>``.
  """
  index_folder = f"vec-db/index/faiss_index_{key}"
  embeddings.save_local(index_folder)
  print("Embeddings saved")
-# A function to load the embedded data
+def load_embedded_data(embeddings=embeddings, key="data"):
 def load_embedded_data(embeddings=embeddings, key="pdf"):
  """Load the FAISS index stored under ``vec-db/index/faiss_index_<key>``.

  Args:
      embeddings: Embedding model handed to ``FAISS.load_local``; defaults
          to the module-level ``embeddings`` object.
      key: Suffix of the index folder to read.

  Returns:
      The vector store returned by ``FAISS.load_local``.
  """
  index_folder = f"vec-db/index/faiss_index_{key}"
  # NOTE(review): allow_dangerous_deserialization=True opts in to loading
  # pickled data from disk; only point this at index folders you trust.
  return FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
 # A function that loads all loadable text documents from a directory.
 def load_documents_from_directory(directory_path: str):
    """Load every loadable text document in a directory and write a manifest.

    Regular files ending in ``.pdf``, ``.txt`` or ``.docx`` are loaded with
    ``load_document``. ``.doc`` and ``.md`` files are reported and skipped
    because ``load_document`` has no loader for them, and every other entry
    (including sub-directories) is ignored. A ``documents.json`` manifest
    with the document names, generated ids and page counts is written into
    the directory; keys already present in an existing manifest are kept.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A tuple ``(documents, docs_id, num_pages)`` of three parallel lists:
        the loaded documents per file, a fresh hex UUID per file, and the
        number of loaded items per file.

    Raises:
        ValueError: Propagated from ``load_document`` when a file fails to load.
    """
    # Extensions load_document can open.
    loadable_extensions = ('pdf', 'txt', 'docx')
    # Text formats without a loader; passing them on would abort the whole run.
    unsupported_text_extensions = ('doc', 'md')

    documents = []
    doc_names = []
    num_pages = []
    # Sorted because os.listdir returns entries in arbitrary order, which
    # would make the manifest order differ between runs.
    for file in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, file)
        # A directory called e.g. "scans.pdf" must not reach the loaders.
        if not os.path.isfile(path):
            continue
        # splitext keeps dots inside the name ("report.v2.pdf" -> "report.v2").
        doc_name, dotted_extension = os.path.splitext(file)
        extension = dotted_extension[1:]
        if extension in unsupported_text_extensions:
            print(f"Skipping {file}: no loader available for .{extension} files")
            continue
        if extension not in loadable_extensions:
            continue
        doc = load_document(path)
        documents.append(doc)
        num_pages.append(len(doc))
        doc_names.append(doc_name)
        print(f"Document {doc_name} loaded")

    # One id per loaded document.
    docs_id = [uuid4().hex for _ in documents]

    json_file = os.path.join(directory_path, "documents.json")
    data = {}
    if os.path.exists(json_file):
        try:
            with open(json_file, 'r') as f:
                existing = json.load(f)
        except json.JSONDecodeError:
            # An unreadable manifest is rebuilt rather than blocking ingestion.
            existing = None
        if isinstance(existing, dict):
            data = existing
    data['doc_names'] = doc_names
    data['docs_id'] = docs_id
    data['num_pages'] = num_pages
    with open(json_file, 'w') as f:
        json.dump(data, f)

    return documents, docs_id, num_pages
 # A function to create vector store
 def create_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
    """Build a FAISS vector store from loaded documents and save it to disk.

    Args:
        embeddings: Embedding model used by the store; it must expose
            ``embed_query`` so the index dimension can be derived from it.
        documents: One list of documents per source file.
        docs_id: One id string per source file, parallel to ``documents``.
        num_pages: Number of documents per source file, parallel to
            ``documents``; used to generate one id per document.

    Returns:
        The populated vector store, which has also been saved with
        ``save_embedded_data(..., key="data")``.
    """
    # Derive the index dimension from the model instead of hard-coding 384,
    # so a different embedding model does not produce a mismatched index.
    embedding_dimension = len(embeddings.embed_query("hello world"))
    index = faiss.IndexFlatL2(embedding_dimension)
    # Initialize the FAISS vector store
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    # Adding each file's documents under ids of the form <doc_id><page index>.
    for pages, doc_id, page_count in zip(documents, docs_id, num_pages):
        if not pages:
            # Nothing to embed for this file.
            continue
        page_ids = [f"{doc_id}{page_index}" for page_index in range(page_count)]
        vector_store.add_documents(documents=pages, ids=page_ids)
    # saving the vector store automatically
    save_embedded_data(vector_store, key="data")
    return vector_store
 # creating a function to add documents to the vector store
 def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
    """Add documents to the saved vector store and persist the result.

    Args:
        embeddings: Embedding model used to load the saved store.
        documents: One list of documents per source file.
        docs_id: One id string per source file, parallel to ``documents``.
        num_pages: Number of documents per source file, parallel to
            ``documents``; used to generate one id per document.

    Returns:
        The updated vector store.
    """
    # Same "data" key that create_vector_store saves under, so additions go
    # into the index that was created there.
    vector_store = load_embedded_data(embeddings, key="data")
    for pages, doc_id, page_count in zip(documents, docs_id, num_pages):
        if not pages:
            # Nothing to embed for this file.
            continue
        page_ids = [f"{doc_id}{page_index}" for page_index in range(page_count)]
        vector_store.add_documents(documents=pages, ids=page_ids)
    # Without saving, the additions exist only in this local object and are
    # lost as soon as the function returns.
    save_embedded_data(vector_store, key="data")
    print ("Documents added to the vector store")
    return vector_store
 # A document search function
 def search(db, query, k=4):
  docs = db.similarity_search(query, k)
@@ -26,3 +26,6 @@
 2024-08-05 22:12:55,205 - INFO - Receiving the search query
 2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
 2024-08-05 22:13:04,241 - INFO - Search completed
 2024-08-07 17:49:19,962 - INFO - Receiving the search query
 2024-08-07 17:49:29,498 - INFO - Searching for what is lda?
 2024-08-07 17:49:29,876 - INFO - Search completed
@@ -0,0 +1,88 @@
 from flask_restx import Namespace, Resource, fields
 from flask import request, jsonify, current_app as app, send_file
 from ...services.ocr import OCRService
 from ...utils.decorators.auth import protected_route
 from .models.errors import error_404, error_500
 from .models.response import response
 import json
 import os
 import numpy as np
 from werkzeug.datastructures import FileStorage
 # Namespace grouping the OCR endpoints under /v2/api/tools/ocr
 api = Namespace('OCR',
                description='Description',
                path='/v2/api/tools/ocr')
 # Request parser for the upload: a single required file sent in the
 # request's files section under the name 'file'
 upload_parser = api.parser()
 upload_parser.add_argument('file', location='files',
                           type=FileStorage, required=True)
 # OCR Data Model: the output format (required) and the OCR data (optional)
 ocr_model = api.model('OCR', {
    'format': fields.String(required=True),
    'data': fields.String(required=False),
 })
 # Response model: a clone of the shared `response` model with the OCR model
 # nested under 'model'
 success_response = api.clone('OCR Model Response', response, {
    'model': fields.Nested(ocr_model)
 })
@api.route('')
@api.doc(security='apikey')
class OCRResource(Resource):
    """Endpoint that runs OCR on a single uploaded file."""

    @api.doc('get_text')
    @api.expect(upload_parser)
    @protected_route
    def post(self):
        """Run OCR on the uploaded file and return the result.

        The output format is read from a ``format`` value (form field, query
        parameter or JSON body, in that order) and defaults to ``'text'``;
        ``'txt'`` is accepted as an alias. For ``'text'`` the generated text
        file is sent as an attachment; for ``'json'`` the serialised OCR data
        is returned under ``model.data``. When the OCR service yields no
        result, its message is returned as ``{"error": message}``.

        Aborts with 400 for an unknown format or a missing/unnamed file, and
        with 500 when processing fails.
        """
        # Local imports keep this fix independent of the module's import block.
        from werkzeug.exceptions import HTTPException
        from werkzeug.utils import secure_filename

        # The upload is sent as multipart form data, so a JSON body is usually
        # absent; silent=True stops get_json() from failing on such requests.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}
        output_format = (request.form.get('format')
                         or request.args.get('format')
                         or payload.get('format')
                         or 'text')
        if output_format == 'txt':
            output_format = 'text'
        if output_format not in ('text', 'json'):
            api.abort(code=400, message="Invalid Format", error=True)

        args = upload_parser.parse_args()
        file = args.get('file')
        # secure_filename removes directory components, so a crafted upload
        # name cannot write outside the intended location.
        filename = secure_filename(file.filename or '') if file else ''
        if not filename:
            api.abort(code=400, message="Invalid file", error=True)

        try:
            # NOTE(review): as before, the upload is written relative to the
            # working directory rather than UPLOAD_FOLDER; confirm where
            # OCRService expects to find the file.
            file.save(filename)
            ocr_service = OCRService(image_directory='',
                                     export_directory=os.path.join(app.config['UPLOAD_FOLDER']),
                                     language='en')
            result = ocr_service.read_text(filename, output_format=output_format)

            if output_format == 'text':
                txt_file, message = result
                if txt_file:
                    return send_file(os.path.join("..", '..', '..', txt_file), mimetype='text/plain', as_attachment=True, download_name=txt_file)
                return jsonify(error=message)

            json_data, message = result
            if not json_data:
                return jsonify(error=message)
            result_json = json.dumps(json_data, default=np_encoder)
            return {'model': {
                'format': output_format,
                'data': result_json
            }}
        except HTTPException:
            # Let deliberate HTTP errors through instead of masking them as 500.
            raise
        except Exception:
            app.logger.exception("OCR request failed")
            api.abort(code=500, message="Something went wrong", error=True)
 def np_encoder(object):
    """``json.dumps`` ``default`` hook that converts NumPy values.

    Args:
        object: The value ``json`` could not serialise. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        The Python scalar for a NumPy scalar, or a nested list for an array.

    Raises:
        TypeError: For any other type, as ``json.dumps`` expects from a
            ``default`` hook; returning ``None`` would silently write ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(f"Object of type {type(object).__name__} is not JSON serializable")
@@ -21,5 +21,4 @@ if __name__ == "__main__":
    logger.info(f"Page content: {page_content}")
    print(f"Page content: {page_content}")
    print(f"Pages: {pages}")
    print(f"All: {all}")
    print("Search completed")
@@ -0,0 +1,158 @@
 import pytesseract
 from PIL import Image
 import pdfplumber
 import platform
 import os
 import io
 class TextExtractor:
    """Extracts text from image files and PDFs using pytesseract and pdfplumber.

    The public ``read_*``/``extract_*`` methods delete the input file once
    they are done with it, whether or not extraction succeeded.
    """

    def __init__(self):
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable based on the detected platform.

        The platform's conventional install path is used when it exists;
        otherwise a ``tesseract`` found on PATH is used, falling back to the
        conventional path if none is found.
        """
        import shutil

        default_paths = {
            'Linux': '/usr/bin/tesseract',
            'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
            'Darwin': '/usr/local/bin/tesseract',
        }
        tesseract_path = default_paths.get(platform.system())
        if tesseract_path is None:
            print("Unsupported platform. Please set the Tesseract path manually.")
            return
        if not os.path.exists(tesseract_path):
            # Installs outside the conventional location are picked up from PATH.
            tesseract_path = shutil.which('tesseract') or tesseract_path
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    @staticmethod
    def _remove_file(path):
        """
        Deletes ``path`` and reports, rather than raises, if that fails.

        Used from ``finally`` blocks, where an exception from ``os.remove``
        would replace the value the method is trying to return.
        """
        try:
            os.remove(path)
        except OSError as e:
            print(f"Could not remove {path}: {e}")

    def read_text_from_image(self, image_path):
        """
        Reads text from an image using pytesseract, then deletes the image.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or "" if reading failed.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""
        finally:
            self._remove_file(image_path)

    def read_text_from_pdf(self, pdf_path):
        """
        Reads the text layer of a PDF file using pdfplumber, then deletes the file.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Text of all pages concatenated, or "" if reading failed.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # ``or ""`` keeps a page without extractable text from
                # discarding the text of every other page.
                return "".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def extract_text_from_pdf(self, pdf_path):
        """
        Reads text from a PDF file, running OCR on pages that contain images,
        then deletes the file.

        Pages with images contribute the OCR text of their images; all other
        pages contribute their text layer.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Text of all pages concatenated, or "" if reading failed.
        """
        try:
            print("path", pdf_path)
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    if self._has_images(page):
                        parts.append(self._extract_text_from_images(page))
                    else:
                        parts.append(page.extract_text() or "")
            return "".join(parts)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def _image_objects(self, page):
        """
        Returns the image objects of a page.

        ``page.objects`` maps a key to a list of objects; a list counts as
        images when its first object has ``object_type == "image"``.
        """
        images = []
        for group in page.objects.values():
            # Empty groups are skipped instead of raising IndexError.
            if group and group[0]['object_type'] == "image":
                images.extend(group)
        return images

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            bool: True if the page contains images, False otherwise.
        """
        return bool(self._image_objects(page))

    def _extract_text_from_images(self, page):
        """
        Extracts text from every image within a PDF page using pytesseract.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            str: OCR text of the page's images, concatenated.
        """
        return "".join(
            self._read_text_from_image(obj["x0"], obj["y0"], obj["x1"], obj["y1"], obj['stream'])
            for obj in self._image_objects(page)
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Runs OCR on an image embedded in a PDF page.

        Args:
            x0, y0, x1, y1 (float): Coordinates of the image on the page;
                currently unused because the whole image is read.
            stream: Image stream object exposing ``get_rawdata()``.

        Returns:
            str: Extracted text, or "" if the image could not be read.
        """
        try:
            # Raw stream bytes are handed to PIL as an in-memory file.
            raw_image = stream.get_rawdata()
            pil_image = Image.open(io.BytesIO(raw_image))
            return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""
		`@@ -0,0 +1 @@`
							{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}