image_experiment.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel; !pip runs whichever pip the shell finds first\n",
    "%pip install -q pdfplumber"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from text_extractor import TextExtractor\n",
    "from langchain_core.documents  import Document\n",
    "import os\n",
    "import base64\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "# Load variables from a .env file (if one is found) into the process environment\n",
    "load_dotenv()\n",
    "\n",
    "# OpenAI API key looked up from the environment; None when the variable is unset\n",
    "api_key = os.environ.get('OPENAI_API_KEY')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vision Model Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode_image(image_path):\n",
    "    \"\"\"Return the contents of the file at `image_path` as a base64 string.\n",
    "\n",
    "    The file is read in binary mode and the base64 bytes are decoded as UTF-8 text.\n",
    "    \"\"\"\n",
    "    with open(image_path, \"rb\") as fh:\n",
    "        raw_bytes = fh.read()\n",
    "    encoded = base64.b64encode(raw_bytes)\n",
    "    return encoded.decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_image(image_path, timeout=60):\n",
    "    \"\"\"Ask the OpenAI chat-completions endpoint to describe an image.\n",
    "\n",
    "    The file is base64-encoded, embedded in a data URL and posted together\n",
    "    with a fixed text prompt to the gpt-4o-mini model.\n",
    "\n",
    "    Args:\n",
    "        image_path: Path of the image file to send.\n",
    "        timeout: Seconds to wait for the HTTP response before giving up.\n",
    "\n",
    "    Returns:\n",
    "        The message content of the first choice in the API response, or the\n",
    "        sentinel string \"Image not good enough for processing\" when the\n",
    "        request fails or its response cannot be parsed.\n",
    "\n",
    "    Raises:\n",
    "        OSError: If the image file cannot be read. The file is read before\n",
    "            the guarded request, so this is not turned into the sentinel.\n",
    "    \"\"\"\n",
    "    import mimetypes  # standard library; local import because only this function needs it\n",
    "\n",
    "    # Getting the base64 string\n",
    "    base64_image = encode_image(image_path)\n",
    "\n",
    "    # Declare the file's real media type in the data URL; fall back to JPEG when\n",
    "    # the extension is unknown or does not map to an image type.\n",
    "    mime_type = mimetypes.guess_type(image_path)[0]\n",
    "    if not mime_type or not mime_type.startswith(\"image/\"):\n",
    "        mime_type = \"image/jpeg\"\n",
    "\n",
    "    # api_key is the module-level value read from the environment\n",
    "    headers = {\n",
    "        \"Content-Type\": \"application/json\",\n",
    "        \"Authorization\": f\"Bearer {api_key}\"\n",
    "    }\n",
    "\n",
    "    payload = {\n",
    "        \"model\": \"gpt-4o-mini\",\n",
    "        \"messages\": [\n",
    "            {\n",
    "                \"role\": \"user\",\n",
    "                \"content\": [\n",
    "                    {\n",
    "                        \"type\": \"text\",\n",
    "                        \"text\": \"What’s in this image?\"\n",
    "                    },\n",
    "                    {\n",
    "                        \"type\": \"image_url\",\n",
    "                        \"image_url\": {\n",
    "                            \"url\": f\"data:{mime_type};base64,{base64_image}\"\n",
    "                        }\n",
    "                    }\n",
    "                ]\n",
    "            }\n",
    "        ],\n",
    "        \"max_tokens\": 300\n",
    "    }\n",
    "\n",
    "    try:\n",
    "        # A timeout keeps a stalled connection from blocking the notebook forever\n",
    "        response = requests.post(\n",
    "            \"https://api.openai.com/v1/chat/completions\",\n",
    "            headers=headers,\n",
    "            json=payload,\n",
    "            timeout=timeout,\n",
    "        )\n",
    "        # Surface HTTP errors (bad key, rate limit, server error) as such instead\n",
    "        # of as a KeyError on the missing 'choices' field below\n",
    "        response.raise_for_status()\n",
    "        # returning the content of the response\n",
    "        return response.json()['choices'][0]['message']['content']\n",
    "    except Exception as exc:\n",
    "        # Best effort: callers compare against this sentinel, so keep returning\n",
    "        # it, but report the cause rather than hiding it\n",
    "        print(f\"process_image failed for {image_path!r}: {exc!r}\")\n",
    "        return \"Image not good enough for processing\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_image_document(image_path):\n",
    "    \"\"\"Build a one-element list holding a Document with the text found in an image.\n",
    "\n",
    "    Text is first read with TextExtractor.read_text_from_image; when nothing\n",
    "    usable comes back, the image is described by process_image instead.\n",
    "\n",
    "    Args:\n",
    "        image_path: Path of the image file.\n",
    "\n",
    "    Returns:\n",
    "        A list with one Document whose metadata 'filename' is the file name\n",
    "        without directory or extension, or None when process_image reports\n",
    "        a failure.\n",
    "    \"\"\"\n",
    "    # file name without directory or extension, e.g. 'data/car.v2.jpg' -> 'car.v2'\n",
    "    image_name = os.path.splitext(os.path.basename(image_path))[0]\n",
    "    # setting image name as metadata\n",
    "    metadata = {'filename': image_name}\n",
    "\n",
    "    text_extractor = TextExtractor()\n",
    "    # `or ''` guards against the extractor yielding None (not confirmed from here)\n",
    "    text = text_extractor.read_text_from_image(image_path) or ''\n",
    "    # keep only letters, digits and whitespace; line breaks count as whitespace and stay\n",
    "    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())\n",
    "\n",
    "    # nothing readable (empty or whitespace only): fall back to the OpenAI vision model\n",
    "    if not text.strip():\n",
    "        text = process_image(image_path)\n",
    "\n",
    "    # process_image signals failure with this sentinel string; no document is built then\n",
    "    if text == \"Image not good enough for processing\":\n",
    "        return None\n",
    "\n",
    "    # creating a document from the text and returning it wrapped in a list\n",
    "    return [Document(page_content=text, metadata=metadata)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(metadata={'filename': 'hyundai-sonata-auto-body-repair-before'}, page_content=\"The image shows a dark-colored car with visible damage on the driver's side. The damage appears to be a dent and scratches on the door and fender area. The car is parked indoors, likely in a garage.\")]\n"
     ]
    }
   ],
   "source": [
    "# Smoke test: build a document for one sample image and print the result.\n",
    "# create_image_document returns a list with one Document, or None on failure,\n",
    "# so `text` holds that list (not a plain string).\n",
    "image_path = 'data/hyundai-sonata-auto-body-repair-before.jpg'\n",
    "text = create_image_document(image_path)\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smog_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}