complete document ingestion pipeline
This commit is contained in:
@@ -0,0 +1,158 @@
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import pdfplumber
|
||||
import platform
|
||||
import os
|
||||
import io
|
||||
|
||||
|
||||
class TextExtractor:
    """Extract text from image files and PDF documents.

    PDF text layers are read with pdfplumber; standalone images and images
    embedded in PDF pages are run through Tesseract OCR via pytesseract.

    Note:
        ``read_text_from_image``, ``read_text_from_pdf`` and
        ``extract_text_from_pdf`` delete the input file once they are done
        with it, whether or not extraction succeeded.
    """

    # Conventional Tesseract install location per ``platform.system()`` value.
    _DEFAULT_TESSERACT_PATHS = {
        'Linux': '/usr/bin/tesseract',
        'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
        'Darwin': '/usr/local/bin/tesseract',
    }

    def __init__(self):
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable based on the detected platform.

        The platform's conventional install location is used when it exists.
        If it does not, the first ``tesseract`` found on ``PATH`` is used
        instead; if neither is available the conventional location is still
        configured, so the eventual OCR error names the expected path.
        On an unrecognised platform a message is printed and pytesseract's
        configuration is left untouched.
        """
        import shutil  # local import: only needed for the PATH fallback

        default_path = self._DEFAULT_TESSERACT_PATHS.get(platform.system())
        if default_path is None:
            print("Unsupported platform. Please set the Tesseract path manually.")
            return

        tesseract_path = default_path
        if not os.path.isfile(default_path):
            tesseract_path = shutil.which("tesseract") or default_path
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    @staticmethod
    def _remove_file(path):
        """Delete ``path`` on a best-effort basis.

        Called from ``finally`` clauses, so it must never raise an OS error:
        an exception escaping a ``finally`` block would replace the value
        the method was about to return.
        """
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up (e.g. the input path never existed).
            pass
        except OSError as e:
            print(f"Error removing file {path}: {e}")

    @staticmethod
    def _page_text(page):
        """Return the text layer of a PDF page, or "" when there is none."""
        # extract_text() may yield None for a page without text; coerce it so
        # one empty page cannot abort extraction of the whole document.
        return page.extract_text() or ""

    @staticmethod
    def _image_objects(page):
        """Return every object on ``page`` whose ``object_type`` is "image"."""
        return [
            obj
            for objs in page.objects.values()
            for obj in objs
            if obj.get('object_type') == "image"
        ]

    def read_text_from_image(self, image_path):
        """
        Reads text from an image using pytesseract.

        The image file is deleted afterwards, even when reading fails.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or "" if any error occurred.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""
        finally:
            self._remove_file(image_path)

    def read_text_from_pdf(self, pdf_path):
        """
        Reads the text layer of a PDF file using pdfplumber (no OCR).

        Page texts are concatenated without a separator. The PDF file is
        deleted afterwards, even when reading fails.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if any error occurred.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                return "".join(self._page_text(page) for page in pdf.pages)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def extract_text_from_pdf(self, pdf_path):
        """
        Reads text from a PDF file, using OCR for pages that contain images.

        For each page: if it contains images, the images are OCR'd; if the
        page has no images, or OCR produced only whitespace, the page's text
        layer is used instead. Page texts are concatenated without a
        separator. The PDF file is deleted afterwards, even when reading
        fails.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if any error occurred.
        """
        try:
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = ""
                    if self._has_images(page):
                        page_text = self._extract_text_from_images(page)
                    if not page_text.strip():
                        # No images, or OCR found nothing: do not drop the
                        # page, fall back to its text layer.
                        page_text = self._page_text(page)
                    parts.append(page_text)
            return "".join(parts)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            bool: True if the page contains images, False otherwise.
        """
        return bool(self._image_objects(page))

    def _extract_text_from_images(self, page):
        """
        Extracts text from images within a PDF page using pytesseract.

        Every image object on the page is processed, in the order in which
        ``page.objects`` lists them.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            str: Concatenated text extracted from all images on the page.
        """
        return "".join(
            self._read_text_from_image(
                image["x0"], image["y0"], image["x1"], image["y1"], image['stream']
            )
            for image in self._image_objects(page)
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Runs OCR on one image embedded in a PDF page.

        Args:
            x0, y0, x1, y1 (float): Bounding box of the image as reported by
                the page object. Currently unused: the whole embedded image
                is OCR'd. Kept so existing callers keep working.
                NOTE(review): these look like page-space coordinates rather
                than pixel coordinates of the embedded image, so cropping
                with them directly would be wrong; confirm before using them.
            stream: PDF stream object of the image; must provide
                ``get_rawdata()`` returning the image bytes.

        Returns:
            str: Text recognised in the image, or "" if the image could not
            be opened or OCR failed.
        """
        try:
            # The stream bytes are handed to Pillow as-is, so this only works
            # when they form an image file Pillow can identify; anything else
            # raises and is reported below.
            raw_image = stream.get_rawdata()
            with Image.open(io.BytesIO(raw_image)) as pil_image:
                return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user