Files
ds_erp_ai/text_extractor.py
2024-08-08 14:58:44 +01:00

159 lines
4.8 KiB
Python

import pytesseract
from PIL import Image
import pdfplumber
import platform
import os
import io
class TextExtractor:
    """Extract text from image files and PDF documents.

    Plain PDF text is read with pdfplumber; images (standalone files or
    images embedded in PDF pages) are run through Tesseract OCR via
    pytesseract.  All public readers are best-effort: on failure they print
    the error and return an empty string instead of raising.
    """

    # Conventional Tesseract install location, keyed by ``platform.system()``.
    _DEFAULT_TESSERACT_PATHS = {
        'Linux': '/usr/bin/tesseract',
        'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
        'Darwin': '/usr/local/bin/tesseract',
    }

    def __init__(self):
        """Configure pytesseract for the current platform."""
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable based on the detected platform.

        The conventional per-platform location is used when it exists.  If it
        does not (e.g. a Homebrew install under a different prefix), the
        executable is looked up on ``PATH``; when that also fails the
        conventional location is still configured, so the behaviour matches
        the previous hard-coded assignment.
        """
        # Local import: shutil is only needed here and is not imported at
        # module level.
        import shutil

        tesseract_path = self._DEFAULT_TESSERACT_PATHS.get(platform.system())
        if tesseract_path is None:
            print("Unsupported platform. Please set the Tesseract path manually.")
            return

        if not os.path.isfile(tesseract_path):
            tesseract_path = shutil.which("tesseract") or tesseract_path
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def read_text_from_image(self, image_path):
        """
        Reads text from an image using pytesseract.

        Unlike the PDF readers, this method leaves the image file on disk.

        Args:
            image_path (str): Path to the image file.
        Returns:
            str: Extracted text from the image, or "" if the image could not
                be opened or OCR failed.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""

    def read_text_from_pdf(self, pdf_path):
        """
        Reads the text layer of a PDF file using pdfplumber (no OCR).

        The file at ``pdf_path`` is deleted afterwards, whether or not
        extraction succeeded.

        Args:
            pdf_path (str): Path to the PDF file.
        Returns:
            str: Extracted text from the PDF, or "" on failure.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() may yield None for a page without text;
                # treat that as empty rather than failing the whole document.
                return "".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def extract_text_from_pdf(self, pdf_path):
        """
        Reads text from a PDF file, using OCR for pages that contain images.

        Pages with at least one embedded image are OCR'd image by image;
        all other pages contribute their pdfplumber text layer.  The file at
        ``pdf_path`` is deleted afterwards, whether or not extraction
        succeeded.

        Args:
            pdf_path (str): Path to the PDF file.
        Returns:
            str: Extracted text from the PDF, or "" on failure.
        """
        try:
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    if self._has_images(page):
                        # NOTE(review): the page's own text layer is skipped
                        # when images are present (e.g. a text page with a
                        # logo) -- confirm this is intended.
                        parts.append(self._extract_text_from_images(page))
                    else:
                        parts.append(page.extract_text() or "")
            return "".join(parts)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    @staticmethod
    def _remove_file(path):
        """Delete ``path`` without letting a cleanup failure mask the result.

        Raising from a ``finally`` block would replace the value being
        returned by the caller, so a missing file is ignored and any other
        OS error is only reported.
        """
        try:
            os.remove(path)
        except FileNotFoundError:
            pass  # nothing to clean up
        except OSError as e:
            print(f"Error removing file {path}: {e}")

    @staticmethod
    def _image_objects(page):
        """Yield every image object found in ``page.objects``.

        ``page.objects`` maps a key to a list of object dicts; a list is
        treated as the image list when its first entry has
        ``object_type == "image"``.  Empty lists are skipped.
        """
        for objects in page.objects.values():
            if objects and objects[0]['object_type'] == "image":
                yield from objects

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.
        Args:
            page (pdfplumber.Page): PDF page object.
        Returns:
            bool: True if the page contains images, False otherwise.
        """
        return next(self._image_objects(page), None) is not None

    def _extract_text_from_images(self, page):
        """
        Extracts text from all images within a PDF page using pytesseract.
        Args:
            page (pdfplumber.Page): PDF page object.
        Returns:
            str: Concatenated text extracted from every image on the page.
        """
        return "".join(
            self._read_text_from_image(
                image["x0"], image["y0"], image["x1"], image["y1"], image['stream']
            )
            for image in self._image_objects(page)
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Runs OCR on an image embedded in a PDF page.
        Args:
            x0, y0, x1, y1 (float): Bounding box of the image on the page.
                Currently unused: the whole embedded image is OCR'd.
            stream: Image stream object exposing ``get_rawdata()``.
        Returns:
            str: Extracted text from the image, or "" if the raw data could
                not be opened as an image or OCR failed.
        """
        try:
            raw_image = stream.get_rawdata()
            # Pillow must recognise the raw stream bytes as an image file;
            # anything it cannot open falls through to the handler below.
            with Image.open(io.BytesIO(raw_image)) as pil_image:
                return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""