image pipeline perfected. audio pipeline in progress

2024-08-08 22:06:39 +01:00
parent c54dc17989
commit f1aa34bef2
10 changed files with 319 additions and 63 deletions
@@ -11,17 +11,93 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from text_extractor import TextExtractor\n",
-    "from langchain_core.documents  import Document"
+    "from langchain_core.documents  import Document\n",
+    "import os\n",
+    "import base64\n",
+    "import requests\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "# OpenAI API Key\n",
+    "api_key = os.getenv('OPENAI_API_KEY')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Vision Model Set Up"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Function to encode the image\n",
+    "def encode_image(image_path):\n",
+    "  with open(image_path, \"rb\") as image_file:\n",
+    "    return base64.b64encode(image_file.read()).decode('utf-8')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_image(image_path):\n",
+    "    global api_key\n",
+    "\n",
+    "    # Getting the base64 string\n",
+    "    base64_image = encode_image(image_path)\n",
+    "\n",
+    "    headers = {\n",
+    "        \"Content-Type\": \"application/json\",\n",
+    "        \"Authorization\": f\"Bearer {api_key}\"\n",
+    "    }\n",
+    "\n",
+    "    try:\n",
+    "        payload = {\n",
+    "            \"model\": \"gpt-4o-mini\",\n",
+    "            \"messages\": [\n",
+    "                {\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": [\n",
+    "                        {\n",
+    "                            \"type\": \"text\",\n",
+    "                            \"text\": \"What’s in this image?\"\n",
+    "                        },\n",
+    "                        {\n",
+    "                            \"type\": \"image_url\",\n",
+    "                            \"image_url\": {\n",
+    "                                \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n",
+    "                            }\n",
+    "                        }\n",
+    "                    ]\n",
+    "                }\n",
+    "            ],\n",
+    "            \"max_tokens\": 300\n",
+    "        }\n",
+    "\n",
+    "        response = requests.post(\"https://api.openai.com/v1/chat/completions\", headers=headers, json=payload)\n",
+    "        # returning the content of the response\n",
+    "        response = response.json()['choices'][0]['message']['content']\n",
+    "    except Exception as e:\n",
+    "        response = \"Image not good enough for processing\"\n",
+    "\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -35,79 +111,41 @@
    "    text = text_extractor.read_text_from_image(image_path)\n",
    "    # removing special characters and line breaks\n",
    "    text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n",
-    "    doc = Document(page_content=text, metadata=metadata)\n",
-    "    # returning the document\n",
-    "    return [doc]"
+    "    \n",
+    "    # if the extracted text is empty, fall back to processing the image with the OpenAI vision model\n",
+    "    if text == '':\n",
+    "        text = process_image(image_path)\n",
+    "        \n",
+    "    # return a document only when the text is not the failure message produced by process_image\n",
+    "    if text != \"Image not good enough for processing\":\n",
+    "        # creating a document from the text\n",
+    "        doc = Document(page_content=text, metadata=metadata)\n",
+    "        # returning the document\n",
+    "        return [doc]\n",
+    "    else:\n",
+    "        pass # if there's an error, we will return None"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[Document(metadata={'filename': 'IMG_1438'}, page_content='ex   a\\n\\nAccidented car before repair\\n')]\n"
+      "[Document(metadata={'filename': 'hyundai-sonata-auto-body-repair-before'}, page_content=\"The image shows a dark-colored car with visible damage on the driver's side. The damage appears to be a dent and scratches on the door and fender area. The car is parked indoors, likely in a garage.\")]\n"
     ]
    }
   ],
   "source": [
    "# testing the function\n",
-    "image_path = 'data/IMG_1438.jpeg'\n",
+    "image_path = 'data/hyundai-sonata-auto-body-repair-before.jpg'\n",
    "text = create_image_document(image_path)\n",
    "print(text)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'filename': 'IMG_1438'}"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "text[0].metadata"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "code",
   "execution_count": null,