complete document ingestion pipeline

This commit is contained in:
timothyafolami
2024-08-07 17:50:40 +01:00
parent c65b0ecdb9
commit 8e6acc7cf8
11 changed files with 739 additions and 438 deletions
+1
View File
@@ -0,0 +1 @@
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
Binary file not shown.
+168 -9
View File
@@ -1,5 +1,15 @@
from langchain_community.embeddings import HuggingFaceBgeEmbeddings from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from uuid import uuid4
from langchain_core.documents import Document
import os
import json
# loading the embedding model # loading the embedding model
@@ -16,21 +26,170 @@ def load_embedding_model():
embeddings = load_embedding_model() embeddings = load_embedding_model()
def create_documents(doc):
    """Split the first loaded document into chunk-sized ``Document`` objects.

    Args:
        doc: Sequence of loaded documents; only ``doc[0]`` is read.

    Returns:
        list: One ``Document`` per chunk. Each carries a copy of the source
        metadata with ``'page'`` set to the chunk's zero-based position.
    """
    source = doc[0]
    body = source.page_content
    base_metadata = source.metadata

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=10,
        length_function=len,
        is_separator_regex=False,
    )
    pieces = splitter.create_documents([body])

    # The chunk index doubles as the "page" number of each produced document.
    return [
        Document(
            page_content=piece.page_content,
            metadata={**base_metadata, 'page': position},
        )
        for position, piece in enumerate(pieces)
    ]
# A function to save the embedded data
def load_txt_document(document_path):
    """Load a plain-text file and split it into chunked documents.

    Args:
        document_path: Path of the ``.txt`` file to load.

    Returns:
        list: The chunked documents produced by ``create_documents``.

    Raises:
        ValueError: If the file cannot be loaded or split. The underlying
            error is attached as ``__cause__``.
    """
    try:
        txt_doc = TextLoader(document_path)
        text = txt_doc.load()
        # Split the loaded text into chunk-sized documents.
        docs = create_documents(text)
        return docs
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; `from err` keeps the root cause visible.
        raise ValueError(f"Error loading -- {document_path}") from err
def load_docx_document(document_path):
    """Load a ``.docx`` file and split it into chunked documents.

    Args:
        document_path: Path of the ``.docx`` file to load.

    Returns:
        list: The chunked documents produced by ``create_documents``.

    Raises:
        ValueError: If the file cannot be loaded or split. The underlying
            error is attached as ``__cause__``.
    """
    try:
        docx_doc = Docx2txtLoader(document_path)
        text = docx_doc.load()
        # Split the loaded text into chunk-sized documents.
        docs = create_documents(text)
        return docs
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; `from err` keeps the root cause visible.
        raise ValueError(f"Error loading -- {document_path}") from err
# A function that loads a PDF document and splits it into pages
def load_pdf_document(document_path):
    """Load a PDF file and return the documents it is split into.

    Args:
        document_path: Path of the ``.pdf`` file to load.

    Returns:
        list: The documents returned by ``PyPDFLoader.load_and_split``.

    Raises:
        ValueError: If the file cannot be loaded. The underlying error is
            attached as ``__cause__``.
    """
    try:
        pdf_doc = PyPDFLoader(document_path)
        pages = pdf_doc.load_and_split()
        return pages
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; `from err` keeps the root cause visible.
        raise ValueError(f"Error loading -- {document_path}") from err
# A general function that loads textual documents
def load_document(document_path):
    """Load a textual document with the loader matching its file extension.

    The extension is compared case-insensitively, so ``report.PDF`` is handled
    the same way as ``report.pdf``.

    Args:
        document_path: Path of the document; must end in ``.pdf``, ``.txt``
            or ``.docx``.

    Returns:
        list: The documents produced by the selected loader.

    Raises:
        ValueError: If the extension is not supported, or if the selected
            loader fails.
    """
    # Only the lowered copy is used for matching; the loaders and the error
    # message receive the path exactly as given.
    lowered = document_path.lower()
    if lowered.endswith(".pdf"):
        return load_pdf_document(document_path)
    if lowered.endswith(".txt"):
        return load_txt_document(document_path)
    if lowered.endswith(".docx"):
        return load_docx_document(document_path)
    raise ValueError(f"Unsupported document type for {document_path}")
def save_embedded_data(embeddings, key="data"):
    """Persist a vector store under ``vec-db/index/faiss_index_<key>``.

    Args:
        embeddings: Object exposing ``save_local(path)``. Despite the name,
            callers in this file pass the FAISS vector store here.
        key: Suffix that selects the index directory.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    embeddings.save_local(index_dir)
    print("Embeddings saved")
# A function to load the embedded data
def load_embedded_data(embeddings=embeddings, key="data"):
    """Load the vector store saved under ``vec-db/index/faiss_index_<key>``.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` object.
        key: Suffix that selects the index directory.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data from disk; only point this at indexes this application
    # wrote itself.
    return FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
# creating a function to load all documents from a directory.
def load_documents_from_directory(directory_path: str):
    """Load every text document in a directory and record a JSON manifest.

    Each file whose extension is listed as a text type is loaded with
    ``load_document``. A fresh hex UUID is generated per loaded document, and
    the names, ids and page counts are written to ``documents.json`` inside
    the directory. Unrelated keys in an existing manifest are preserved.

    Args:
        directory_path: Directory whose files should be ingested.

    Returns:
        tuple: ``(documents, docs_id, num_pages)`` as index-aligned lists.
        ``documents[i]`` is the list returned by ``load_document`` for the
        i-th file, ``docs_id[i]`` its generated id and ``num_pages[i]`` its
        length.

    Raises:
        ValueError: Propagated from ``load_document`` when a file cannot be
            loaded.
    """
    # Extensions treated as text documents.
    # NOTE(review): 'doc' and 'md' are accepted here but load_document only
    # handles pdf/txt/docx, so such files will raise ValueError.
    text_doc = ('pdf', 'txt', 'docx', 'doc', 'md')

    documents = []
    doc_names = []
    num_pages = []

    for file in os.listdir(directory_path):
        path = os.path.join(directory_path, file)
        # splitext cuts at the LAST dot only, so "manual.v2.pdf" keeps the
        # name "manual.v2", and a file with no dot has an empty extension
        # instead of its whole name being mistaken for one.
        doc_name, dotted_extension = os.path.splitext(file)
        extension = dotted_extension[1:]
        if extension not in text_doc:
            continue

        doc = load_document(path)
        documents.append(doc)
        num_pages.append(len(doc))
        doc_names.append(doc_name)
        print(f"Document {doc_name} loaded")

    # One fresh identifier per loaded document.
    docs_id = [uuid4().hex for _ in documents]

    # Start from the existing manifest (if any) so unrelated keys survive,
    # then overwrite the three lists managed here.
    json_file = f"{directory_path}/documents.json"
    data = {}
    if os.path.exists(json_file):
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    data['doc_names'] = doc_names
    data['docs_id'] = docs_id
    data['num_pages'] = num_pages
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f)

    return documents, docs_id, num_pages
# A function to create vector store
def create_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
    """Build a new FAISS vector store from loaded documents and save it.

    Args:
        embeddings: Embedding model used by the store.
        documents: One list of page documents per source file.
        docs_id: One identifier per source file, aligned with ``documents``.
        num_pages: Page count per source file, aligned with ``documents``.

    Returns:
        The populated FAISS vector store. It is also saved through
        ``save_embedded_data`` under the key ``"data"``.
    """
    # Size the index from the embedding model instead of hard-coding 384, so
    # a different model does not produce a dimension mismatch.
    dimension = len(embeddings.embed_query("dimension probe"))
    index = faiss.IndexFlatL2(dimension)

    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    for pages, doc_id, page_count in zip(documents, docs_id, num_pages):
        # Page ids are the document id followed by the page position.
        page_ids = [doc_id + str(page_number) for page_number in range(page_count)]
        vector_store.add_documents(documents=pages, ids=page_ids)

    # Save the vector store automatically.
    save_embedded_data(vector_store, key="data")
    return vector_store
# creating a function to add documents to the vector store
def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, num_pages: list):
    """Add documents to the saved vector store and save the result.

    The store is loaded with ``load_embedded_data``, extended, and written
    back with ``save_embedded_data``. Previously the extended store was
    neither saved nor returned, so the additions were lost.

    Args:
        embeddings: Embedding model used to load the store.
        documents: One list of page documents per source file.
        docs_id: One identifier per source file, aligned with ``documents``.
        num_pages: Page count per source file, aligned with ``documents``.

    Returns:
        The updated vector store.
    """
    vector_store = load_embedded_data(embeddings)

    for pages, doc_id, page_count in zip(documents, docs_id, num_pages):
        # Page ids are the document id followed by the page position.
        page_ids = [doc_id + str(page_number) for page_number in range(page_count)]
        vector_store.add_documents(documents=pages, ids=page_ids)
    print ("Documents added to the vector store")

    # Write back to the same key load_embedded_data reads by default.
    save_embedded_data(vector_store, key="data")
    return vector_store
# A document search function # A document search function
def search(db, query, k=4): def search(db, query, k=4):
docs = db.similarity_search(query, k) docs = db.similarity_search(query, k)
+321 -428
View File
File diff suppressed because one or more lines are too long
+3
View File
@@ -26,3 +26,6 @@
2024-08-05 22:12:55,205 - INFO - Receiving the search query 2024-08-05 22:12:55,205 - INFO - Receiving the search query
2024-08-05 22:13:04,060 - INFO - Searching for what is LDA? 2024-08-05 22:13:04,060 - INFO - Searching for what is LDA?
2024-08-05 22:13:04,241 - INFO - Search completed 2024-08-05 22:13:04,241 - INFO - Search completed
2024-08-07 17:49:19,962 - INFO - Receiving the search query
2024-08-07 17:49:29,498 - INFO - Searching for what is lda?
2024-08-07 17:49:29,876 - INFO - Search completed
+88
View File
@@ -0,0 +1,88 @@
from flask_restx import Namespace, Resource, fields
from flask import request, jsonify, current_app as app, send_file
from ...services.ocr import OCRService
from ...utils.decorators.auth import protected_route
from .models.errors import error_404, error_500
from .models.response import response
import json
import os
import numpy as np
from werkzeug.datastructures import FileStorage
# flask-restx namespace for the OCR tool; its routes are served under
# /v2/api/tools/ocr.
api = Namespace('OCR',
description='Description',
path='/v2/api/tools/ocr')
# Request parser that requires a single uploaded file in the 'file' field.
upload_parser = api.parser()
upload_parser.add_argument('file', location='files',
type=FileStorage, required=True)
# OCR Data Model: the requested output format and the optional extracted data.
ocr_model = api.model('OCR', {
'format': fields.String(required=True),
'data': fields.String(required=False),
})
# Response model: the shared `response` model extended with a nested OCR
# model under the 'model' key.
success_response = api.clone('OCR Model Response', response, {
'model': fields.Nested(ocr_model)
})
@api.route('')
@api.doc(security='apikey')
class OCRResource(Resource):
    """REST resource that runs OCR on an uploaded file."""

    @api.doc('get_text')
    @api.expect(upload_parser)
    @protected_route
    def post(self):
        """Run OCR on the uploaded file and return text or JSON.

        The output format is read from the ``format`` form field or query
        parameter, falling back to a JSON body. ``'text'`` is the default,
        ``'txt'`` is accepted as an alias, and ``'json'`` is also supported.

        Returns:
            For ``'text'``: the generated text file as an attachment, or a
            JSON error message when no file was produced.
            For ``'json'``: ``{'model': {'format': ..., 'data': ...}}`` where
            ``data`` is the JSON-encoded OCR result.

        Raises:
            HTTPException: 500 "Invalid Format" for an unknown format or an
                empty JSON result; 500 "Something went wrong" for any other
                failure.
        """
        # Imported locally so this block does not depend on edits to the
        # module's import section; werkzeug is already a dependency.
        from werkzeug.exceptions import HTTPException
        from werkzeug.utils import secure_filename

        # Uploads arrive as multipart/form-data, where request.get_json()
        # fails; read the form/query value first and treat JSON as optional.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}
        output_format = request.values.get('format') or payload.get('format') or 'text'
        if output_format == 'txt':
            output_format = 'text'
        if output_format not in ('text', 'json'):
            api.abort(code=500, message="Invalid Format", error=True)

        args = upload_parser.parse_args()
        try:
            file = args.get('file')
            if not file or file.filename == '':
                raise ValueError("Invalid file")

            # Strip path components from the client-supplied name so the
            # upload cannot be written outside the working directory.
            # NOTE(review): the file is saved relative to the working
            # directory, as before; confirm whether UPLOAD_FOLDER was meant.
            filename = secure_filename(file.filename)
            if not filename:
                raise ValueError("Invalid file")
            file.save(filename)

            ocr_service = OCRService(image_directory='',
                                     export_directory=os.path.join(app.config['UPLOAD_FOLDER']),
                                     language='en')
            result = ocr_service.read_text(filename, output_format=output_format)

            if output_format == 'text':
                txt_file, message = result
                if txt_file:
                    return send_file(os.path.join("..", '..', '..', txt_file), mimetype='text/plain', as_attachment=True, download_name=txt_file)
                return jsonify(error=message)

            # output_format == 'json'
            json_data, message = result
            if json_data:
                result_json = json.dumps(json_data, default=np_encoder)
                return {'model': {
                    'format': output_format,
                    'data': result_json
                }}
            api.abort(code=500, message="Invalid Format", error=True)
        except HTTPException:
            # api.abort raises an HTTPException; let it through instead of
            # rewriting it as "Something went wrong".
            raise
        except Exception:
            app.logger.exception("OCR request failed")
            api.abort(code=500, message="Something went wrong", error=True)
def np_encoder(object):
    """``json.dumps`` ``default=`` hook that converts NumPy values.

    Args:
        object: The value ``json`` could not serialise on its own.

    Returns:
        A native Python scalar for NumPy scalars, or a nested list for NumPy
        arrays.

    Raises:
        TypeError: For any other type. Returning ``None`` here would silently
            serialise the value as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(f"Object of type {type(object).__name__} is not JSON serializable")
-1
View File
@@ -21,5 +21,4 @@ if __name__ == "__main__":
logger.info(f"Page content: {page_content}") logger.info(f"Page content: {page_content}")
print(f"Page content: {page_content}") print(f"Page content: {page_content}")
print(f"Pages: {pages}") print(f"Pages: {pages}")
print(f"All: {all}")
print("Search completed") print("Search completed")
+158
View File
@@ -0,0 +1,158 @@
import pytesseract
from PIL import Image
import pdfplumber
import platform
import os
import io
class TextExtractor:
    """Extract text from image files and PDFs.

    Images are read with pytesseract; PDF text layers are read with
    pdfplumber, with pytesseract used for images embedded in PDF pages.
    The public ``read_*`` / ``extract_*`` methods delete their input file
    when they finish.
    """

    def __init__(self):
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable.

        A ``tesseract`` binary found on ``PATH`` is preferred; otherwise the
        conventional install location for the detected platform is used.
        """
        # Local import keeps this block self-contained.
        import shutil

        discovered = shutil.which("tesseract")
        if discovered:
            pytesseract.pytesseract.tesseract_cmd = discovered
            return

        default_paths = {
            'Linux': '/usr/bin/tesseract',
            'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
            'Darwin': '/usr/local/bin/tesseract',
        }
        fallback = default_paths.get(platform.system())
        if fallback is None:
            print("Unsupported platform. Please set the Tesseract path manually.")
            return
        pytesseract.pytesseract.tesseract_cmd = fallback

    @staticmethod
    def _remove_file(path):
        """Delete ``path`` without letting cleanup errors mask the result."""
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up; the input never existed or is already gone.
            pass
        except OSError as e:
            print(f"Could not remove {path}: {e}")

    def read_text_from_image(self, image_path):
        """
        Reads text from an image using pytesseract, then deletes the image.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or "" on any error.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""
        finally:
            self._remove_file(image_path)

    def read_text_from_pdf(self, pdf_path):
        """
        Reads the text layer of a PDF file using pdfplumber, then deletes the PDF.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" on any error.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() may yield None for a page without text.
                return "".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def extract_text_from_pdf(self, pdf_path):
        """
        Reads text from a PDF file, using OCR on pages that contain images.

        Pages with embedded images contribute the OCR text of those images;
        all other pages contribute their text layer. The PDF is deleted
        afterwards.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" on any error.
        """
        try:
            print("path", pdf_path)
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    if self._has_images(page):
                        parts.append(self._extract_text_from_images(page))
                    else:
                        # extract_text() may yield None for a page without text.
                        parts.append(page.extract_text() or "")
            return "".join(parts)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            bool: True if the page contains images, False otherwise.
        """
        # page.objects maps an object type to the list of such objects.
        return bool(page.objects.get("image"))

    def _extract_text_from_images(self, page):
        """
        Extracts text from every image on a PDF page using pytesseract.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            str: Concatenated text extracted from the page's images.
        """
        return "".join(
            self._read_text_from_image(
                image["x0"], image["y0"], image["x1"], image["y1"], image["stream"]
            )
            for image in page.objects.get("image", [])
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Runs OCR on an image embedded in a PDF page.

        Args:
            x0, y0, x1, y1 (float): Bounding box of the image on the page.
                Currently unused: the whole image is passed to OCR.
            stream: Image stream object exposing ``get_rawdata()``.

        Returns:
            str: Extracted text from the image, or "" on any error.
        """
        try:
            # Wrap the stream's raw bytes in a PIL image for OCR.
            raw_image = stream.get_rawdata()
            pil_image = Image.open(io.BytesIO(raw_image))
            return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""
Binary file not shown.
Binary file not shown.