complete document ingestion pipeline

2024-08-07 17:50:40 +01:00
parent c65b0ecdb9
commit 8e6acc7cf8
11 changed files with 739 additions and 438 deletions
@@ -0,0 +1,158 @@
+import pytesseract
+from PIL import Image
+import pdfplumber
+import platform
+import os
+import io
+
+
+class TextExtractor:
+    """
+    Extracts text from image and PDF files using pdfplumber and Tesseract OCR.
+
+    NOTE: the public ``read_*`` / ``extract_*`` methods delete the input file
+    once they are done with it, whether or not the extraction succeeded.
+    """
+
+    def __init__(self):
+        self.set_tesseract_path()
+
+    def set_tesseract_path(self):
+        """
+        Sets the path to the Tesseract executable based on the detected platform.
+
+        The conventional install location for the current platform is used
+        when it exists on disk. Otherwise the executable is looked up on PATH,
+        which covers installs in other locations (e.g. Homebrew on Apple
+        Silicon). If neither is found, the conventional location is still
+        configured for known platforms, and a message is printed for unknown
+        platforms.
+        """
+        import shutil
+
+        default_paths = {
+            'Linux': '/usr/bin/tesseract',
+            'Windows': 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
+            'Darwin': '/usr/local/bin/tesseract',
+        }
+        default_path = default_paths.get(platform.system())
+
+        # Preferred: the well-known location, when it is really there.
+        if default_path is not None and os.path.isfile(default_path):
+            pytesseract.pytesseract.tesseract_cmd = default_path
+            return
+
+        # Fallback: whatever "tesseract" resolves to on PATH.
+        discovered = shutil.which("tesseract")
+        if discovered is not None:
+            pytesseract.pytesseract.tesseract_cmd = discovered
+        elif default_path is not None:
+            # Nothing found anywhere: keep pointing at the conventional
+            # location so the eventual OCR error names a meaningful path.
+            pytesseract.pytesseract.tesseract_cmd = default_path
+        else:
+            print("Unsupported platform. Please set the Tesseract path manually.")
+
+    @staticmethod
+    def _remove_file(path):
+        """
+        Deletes a file without letting cleanup failures escape.
+
+        A file that is already gone is ignored; any other OS-level failure is
+        reported and swallowed, so that cleanup running in a ``finally`` block
+        never replaces the result of the extraction itself.
+
+        Args:
+            path (str): Path of the file to delete.
+        """
+        try:
+            os.remove(path)
+        except FileNotFoundError:
+            pass
+        except OSError as e:
+            print(f"Error removing file {path}: {e}")
+
+    def read_text_from_image(self, image_path):
+        """
+        Reads text from an image using pytesseract.
+
+        The image file is deleted afterwards, even when reading fails.
+
+        Args:
+            image_path (str): Path to the image file.
+
+        Returns:
+            str: Extracted text from the image, or "" if an error occurred.
+        """
+        try:
+            with Image.open(image_path) as img:
+                return pytesseract.image_to_string(img)
+        except Exception as e:
+            print(f"Error reading text from image: {e}")
+            return ""
+        finally:
+            self._remove_file(image_path)
+
+    def read_text_from_pdf(self, pdf_path):
+        """
+        Reads the text layer of a PDF file using pdfplumber (no OCR).
+
+        The PDF file is deleted afterwards, even when reading fails.
+
+        Args:
+            pdf_path (str): Path to the PDF file.
+
+        Returns:
+            str: Concatenated text of all pages, or "" if an error occurred.
+        """
+        try:
+            with pdfplumber.open(pdf_path) as pdf:
+                # A page without a text layer may yield None; count it as
+                # empty instead of failing the whole document.
+                return "".join(page.extract_text() or "" for page in pdf.pages)
+        except Exception as e:
+            print(f"Error reading text from PDF: {e}")
+            return ""
+        finally:
+            self._remove_file(pdf_path)
+
+    def extract_text_from_pdf(self, pdf_path):
+        """
+        Reads text from a PDF file, running OCR on pages that contain images.
+
+        For a page with at least one image, only the OCR output of its images
+        is used; the page's own text layer is not read. Pages without images
+        contribute their text layer. The PDF file is deleted afterwards, even
+        when reading fails.
+
+        Args:
+            pdf_path (str): Path to the PDF file.
+
+        Returns:
+            str: Extracted text from the PDF, or "" if an error occurred.
+        """
+        try:
+            print("path", pdf_path)
+            parts = []
+            with pdfplumber.open(pdf_path) as pdf:
+                for page in pdf.pages:
+                    if self._has_images(page):
+                        parts.append(self._extract_text_from_images(page))
+                    else:
+                        # extract_text() may yield None for an empty page.
+                        parts.append(page.extract_text() or "")
+            return "".join(parts)
+        except Exception as e:
+            print(f"Error reading text from PDF: {e}")
+            return ""
+        finally:
+            self._remove_file(pdf_path)
+
+    def _has_images(self, page):
+        """
+        Checks if a PDF page contains images.
+
+        Args:
+            page (pdfplumber.Page): PDF page object; its ``objects`` mapping
+                is expected to group objects by type under keys such as
+                "image".
+
+        Returns:
+            bool: True if the page contains images, False otherwise.
+        """
+        return bool(page.objects.get("image"))
+
+    def _extract_text_from_images(self, page):
+        """
+        Extracts text from images within a PDF page using pytesseract.
+
+        Every image object on the page is processed, in the order listed by
+        the page.
+
+        Args:
+            page (pdfplumber.Page): PDF page object.
+
+        Returns:
+            str: Concatenated text extracted from all images on the page.
+        """
+        return "".join(
+            self._read_text_from_image(
+                image["x0"], image["y0"], image["x1"], image["y1"], image["stream"]
+            )
+            for image in page.objects.get("image", [])
+        )
+
+    def _read_text_from_image(self, x0, y0, x1, y1, stream):
+        """
+        Runs OCR on a single image embedded in a PDF page.
+
+        Args:
+            x0, y0, x1, y1 (float): Bounding box of the image on the page.
+                Currently unused; the whole embedded image is processed.
+            stream: Stream object of the embedded image; must provide
+                ``get_rawdata()`` returning bytes that PIL can open.
+                NOTE(review): presumably these are the still-encoded stream
+                bytes, so only formats PIL decodes directly (e.g. JPEG) will
+                work; confirm against pdfminer.
+
+        Returns:
+            str: Text recognized in the image, or "" if it could not be
+            opened or processed.
+        """
+        try:
+            raw_image = stream.get_rawdata()
+
+            # Close the decoded image as soon as OCR is done.
+            with Image.open(io.BytesIO(raw_image)) as pil_image:
+                return pytesseract.image_to_string(pil_image)
+        except Exception as e:
+            print(f"Error extracting text from image: {e}")
+            return ""
+
+