image pipeline perfected. audio pipeline in progress

2024-08-08 22:06:39 +01:00
parent c54dc17989
commit f1aa34bef2
10 changed files with 319 additions and 63 deletions
@@ -11,6 +11,13 @@ from langchain_core.documents  import Document
 from text_extractor import TextExtractor
 import os
 import json
+import base64
+import requests
+from dotenv import load_dotenv
+# Load environment variables via python-dotenv before reading the key below
+load_dotenv()
+
+# OpenAI API key read from the OPENAI_API_KEY environment variable
+# (os.getenv returns None when the variable is not set)
+api_key = os.getenv('OPENAI_API_KEY')


 # loading the embedding model
@@ -91,6 +98,56 @@ def load_document(document_path):
    else:
        raise ValueError(f"Unsupported document type for {document_path}")

+# Function to encode the image
+def encode_image(image_path):
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+
+# Vision API to process the image
+def process_image(image_path):
+    global api_key
+
+    # Getting the base64 string
+    base64_image = encode_image(image_path)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+    try:
+        payload = {
+            "model": "gpt-4o-mini",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What’s in this image?"
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
+
+        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+        # returning the content of the response
+        response = response.json()['choices'][0]['message']['content']
+    except Exception as e:
+        response = "Image not good enough for processing"
+
+    return response
+
+
+# create image document
 def create_image_document(image_path):
    # getting the image name from the image path
    image_name = image_path.split('/')[-1].split('.')[0]
@@ -100,9 +157,19 @@ def create_image_document(image_path):
    text = text_extractor.read_text_from_image(image_path)
    # removing special characters and line breaks
    text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\n')
-    doc = Document(page_content=text, metadata=metadata)
-    # returning the document in a list
-    return [doc]
+    
+    # if the text is empty, then we will process the image with OpenAI vision model
+    if text == '':
+        text = process_image(image_path)
+        
+    # only return a document if the text is not the failure message returned by process_image
+    if text != "Image not good enough for processing":
+        # creating a document from the text
+        doc = Document(page_content=text, metadata=metadata)
+        # returning the document
+        return [doc]
+    else:
+        pass # if there's an error, we will return None


 def save_embedded_data(embeddings, key="data"):