import io
import os
import platform
import shutil

import pdfplumber
import pytesseract
from PIL import Image


class TextExtractor:
    """Extract text from image files and PDF documents.

    Plain PDF text is read with pdfplumber; images (standalone files or
    images embedded in PDF pages) are run through Tesseract OCR via
    pytesseract.

    Note:
        ``read_text_from_pdf`` and ``extract_text_from_pdf`` delete the
        PDF file they were given once processing finishes, whether or not
        extraction succeeded.
    """

    # Conventional install location of the Tesseract binary per platform,
    # keyed by the value returned by ``platform.system()``.
    _DEFAULT_TESSERACT_PATHS = {
        'Linux': '/usr/bin/tesseract',
        'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
        'Darwin': '/usr/local/bin/tesseract',
    }

    def __init__(self):
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable based on the detected platform.

        The platform's conventional install path is used when a file exists
        there. Otherwise the executable is looked up on ``PATH``. If neither
        is found, the conventional path is still assigned on known platforms
        (the historical behaviour), and a hint is printed on unknown ones.
        """
        default_path = self._DEFAULT_TESSERACT_PATHS.get(platform.system())

        if default_path and os.path.isfile(default_path):
            pytesseract.pytesseract.tesseract_cmd = default_path
            return

        # The binary is not at the conventional location (or the platform is
        # unknown), e.g. a package manager installed it elsewhere.
        discovered_path = shutil.which("tesseract")
        if discovered_path:
            pytesseract.pytesseract.tesseract_cmd = discovered_path
        elif default_path:
            pytesseract.pytesseract.tesseract_cmd = default_path
        else:
            print("Unsupported platform. Please set the Tesseract path manually.")

    def read_text_from_image(self, image_path):
        """
        Reads text from an image using pytesseract.

        The image file is left in place after reading.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or "" if reading fails.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error reading text from image: {e}")
            return ""

    def read_text_from_pdf(self, pdf_path):
        """
        Reads the text layer of a PDF file using pdfplumber (no OCR).

        The PDF file is deleted afterwards, whether or not extraction
        succeeded.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if reading fails.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() can yield None for a page without text in
                # some pdfplumber versions; treat that as an empty page.
                return "".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def extract_text_from_pdf(self, pdf_path):
        """
        Reads text from a PDF file, using OCR on pages that contain images.

        For a page with at least one embedded image, only the OCR result of
        its images is used; for any other page the text layer is used. The
        PDF file is deleted afterwards, whether or not extraction succeeded.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if reading fails.
        """
        try:
            print("path", pdf_path)
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    if self._has_images(page):
                        parts.append(self._extract_text_from_images(page))
                    else:
                        # Guard against a None result for text-less pages.
                        parts.append(page.extract_text() or "")
            return "".join(parts)
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            self._remove_file(pdf_path)

    def _remove_file(self, path):
        """
        Deletes a file without letting a failure escape to the caller.

        Called from ``finally`` clauses, where an exception would replace
        the value being returned by the extraction method.

        Args:
            path (str): Path of the file to delete.
        """
        try:
            os.remove(path)
        except OSError as e:
            print(f"Error removing file {path}: {e}")

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            bool: True if the page contains images, False otherwise.
        """
        # page.objects maps an object kind to the list of objects of that kind.
        return bool(page.objects.get("image"))

    def _extract_text_from_images(self, page):
        """
        Extracts text from every image within a PDF page using pytesseract.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            str: Concatenated OCR text of the page's images, in the order
            they appear in ``page.objects``.
        """
        return "".join(
            self._read_text_from_image(
                image["x0"], image["y0"], image["x1"], image["y1"], image['stream']
            )
            for image in page.objects.get("image", [])
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Runs OCR on an image embedded in a PDF page.

        Args:
            x0, y0, x1, y1 (float): Bounding box of the image on the page.
                Accepted for interface compatibility; currently unused
                because the whole embedded image is OCR'd without cropping.
            stream: Image stream object exposing ``get_rawdata()``.

        Returns:
            str: Extracted text from the image, or "" if the image data
            cannot be opened or OCR fails.
        """
        try:
            # The raw stream bytes are handed to PIL as-is, so only streams
            # whose raw data is a format PIL can open will succeed.
            raw_image = stream.get_rawdata()
            with Image.open(io.BytesIO(raw_image)) as pil_image:
                return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""