{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install -q pdfplumber" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from text_extractor import TextExtractor\n", "from langchain_core.documents import Document\n", "import os\n", "import base64\n", "import requests\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "\n", "# OpenAI API Key\n", "api_key = os.getenv('OPENAI_API_KEY')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vision Model Set Up" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Function to encode the image\n", "def encode_image(image_path):\n", " with open(image_path, \"rb\") as image_file:\n", " return base64.b64encode(image_file.read()).decode('utf-8')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def process_image(image_path):\n", " global api_key\n", "\n", " # Getting the base64 string\n", " base64_image = encode_image(image_path)\n", "\n", " headers = {\n", " \"Content-Type\": \"application/json\",\n", " \"Authorization\": f\"Bearer {api_key}\"\n", " }\n", "\n", " try:\n", " payload = {\n", " \"model\": \"gpt-4o-mini\",\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"What’s in this image?\"\n", " },\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n", " }\n", " }\n", " ]\n", " }\n", " ],\n", " \"max_tokens\": 300\n", " }\n", "\n", " response = requests.post(\"https://api.openai.com/v1/chat/completions\", headers=headers, json=payload)\n", " # returning the content of the response\n", " response = response.json()['choices'][0]['message']['content']\n", " except Exception as e:\n", " response = \"Image not good enough for processing\"\n", "\n", " return response" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# creating a function to extract texts from image\n", "def create_image_document(image_path):\n", " # getting the image name from the image path\n", " image_name = image_path.split('/')[-1].split('.')[0]\n", " # setting image name as metadata\n", " metadata = {'filename': image_name}\n", " text_extractor = TextExtractor()\n", " text = text_extractor.read_text_from_image(image_path)\n", " # removing special characters and line breaks\n", " text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n", " \n", " # if the text is empty, then we will process the image with OpenAI vision model\n", " if text == '':\n", " text = process_image(image_path)\n", " \n", " # checking if there's no value error or something, we will only return the text if there isnt any error\n", " if text != \"Image not good enough for processing\":\n", " # creating a document from the text\n", " doc = Document(page_content=text, metadata=metadata)\n", " # returning the document\n", " return [doc]\n", " else:\n", " pass # if there's an error, we will return None" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Document(metadata={'filename': 'hyundai-sonata-auto-body-repair-before'}, page_content=\"The image shows a dark-colored car with visible damage on the driver's side. The damage appears to be a dent and scratches on the door and fender area. The car is parked indoors, likely in a garage.\")]\n" ] } ], "source": [ "# testing the function\n", "image_path = 'data/hyundai-sonata-auto-body-repair-before.jpg'\n", "text = create_image_document(image_path)\n", "print(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "smog_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }