# ds_erp_ai/text_extractor.py

import io
import os
import platform
import shutil

import pdfplumber
import pytesseract
from PIL import Image


class TextExtractor:
    """Extract text from image files and PDF documents.

    Images are read with Tesseract OCR (via ``pytesseract``); PDFs are read
    with ``pdfplumber``, optionally running OCR on images embedded in pages.
    """

    # Conventional install location of the Tesseract binary per platform,
    # keyed by the value returned by ``platform.system()``.
    _DEFAULT_TESSERACT_PATHS = {
        'Linux': '/usr/bin/tesseract',
        'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
        'Darwin': '/usr/local/bin/tesseract',
    }

    def __init__(self):
        self.set_tesseract_path()

    def set_tesseract_path(self):
        """
        Sets the path to the Tesseract executable based on the detected platform.

        The platform's conventional install path is used when it exists. If it
        does not, the executable is looked up on ``PATH``; when that lookup
        also fails the conventional path is still configured, so the outcome
        is never worse than using the hard-coded default directly.
        """
        tesseract_cmd = self._DEFAULT_TESSERACT_PATHS.get(platform.system())
        if tesseract_cmd is None:
            print("Unsupported platform. Please set the Tesseract path manually.")
            return

        if not os.path.isfile(tesseract_cmd):
            # Non-standard install location: fall back to whatever is on PATH.
            tesseract_cmd = shutil.which('tesseract') or tesseract_cmd

        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def read_text_from_image(self, image_path: str) -> str:
        """
        Reads text from an image using pytesseract.

        The image file is left in place (it is not deleted).

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or "" if the image could not
            be opened or OCR failed.
        """
        try:
            with Image.open(image_path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            # Best-effort API: report the problem and return an empty result.
            print(f"Error reading text from image: {e}")
            return ""

    def read_text_from_pdf(self, pdf_path: str) -> str:
        """
        Reads the text layer of a PDF file using pdfplumber (no OCR).

        NOTE: the file at ``pdf_path`` is deleted after the attempt, whether
        or not extraction succeeded.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if reading failed.
        """
        return self._read_pdf(pdf_path, ocr_images=False)

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Reads text from a PDF file, running OCR on pages that contain images.

        For each page: if the page contains at least one image, the text is
        taken from OCR of those images; otherwise the page's text layer is
        used.

        NOTE: the file at ``pdf_path`` is deleted after the attempt, whether
        or not extraction succeeded.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str: Extracted text from the PDF, or "" if reading failed.
        """
        return self._read_pdf(pdf_path, ocr_images=True)

    def _read_pdf(self, pdf_path, *, ocr_images):
        """
        Shared implementation of the two public PDF readers.

        Args:
            pdf_path (str): Path to the PDF file.
            ocr_images (bool): When True, pages containing images are OCR'd
                instead of having their text layer read.

        Returns:
            str: Concatenated text of all pages, or "" on any error.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                return "".join(
                    self._page_text(page, ocr_images) for page in pdf.pages
                )
        except Exception as e:
            print(f"Error reading text from PDF: {e}")
            return ""
        finally:
            # Cleanup must never mask the result or the error path above.
            self._remove_file(pdf_path)

    def _page_text(self, page, ocr_images):
        """Returns the text of a single page (OCR or text layer)."""
        if ocr_images and self._has_images(page):
            return self._extract_text_from_images(page)
        # extract_text() may yield None for a page without a text layer;
        # treat that as an empty page instead of failing the whole document.
        return page.extract_text() or ""

    @staticmethod
    def _remove_file(path):
        """
        Deletes ``path``, tolerating a file that is already gone.

        Other removal failures are reported but not raised, so that they
        cannot replace the value returned by the caller's try/except.
        """
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except (OSError, TypeError) as e:
            # TypeError covers a ``path`` that is not path-like (e.g. None).
            print(f"Could not remove {path}: {e}")

    def _has_images(self, page):
        """
        Checks if a PDF page contains images.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            bool: True if the page contains images, False otherwise.
        """
        # page.objects maps an object type to the list of objects of that type.
        return bool(page.objects.get("image"))

    def _extract_text_from_images(self, page):
        """
        Extracts text from images within a PDF page using pytesseract.

        Every image on the page is OCR'd and the results are concatenated in
        the order the images are listed.

        Args:
            page (pdfplumber.Page): PDF page object.

        Returns:
            str: Extracted text from the images.
        """
        return "".join(
            self._read_text_from_image(
                image["x0"], image["y0"], image["x1"], image["y1"], image["stream"]
            )
            for image in page.objects.get("image", [])
        )

    def _read_text_from_image(self, x0, y0, x1, y1, stream):
        """
        Reads text from an embedded PDF image using pytesseract.

        Args:
            x0, y0, x1, y1 (float): Coordinates of the image on the page.
                Currently unused: the whole image is OCR'd without cropping.
            stream: Image stream object exposing ``get_rawdata()``, which
                must return bytes in a format PIL can open.

        Returns:
            str: Extracted text from the image, or "" if the image data could
            not be decoded or OCR failed.
        """
        try:
            raw_image = stream.get_rawdata()
            # Close the decoded image as soon as OCR is done.
            with Image.open(io.BytesIO(raw_image)) as pil_image:
                return pytesseract.image_to_string(pil_image)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""