Files
ds_erp_ai/image_experiment.ipynb
2024-08-08 22:06:39 +01:00

200 lines
5.6 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%pip install -q pdfplumber"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"import mimetypes\n",
"import os\n",
"import warnings\n",
"\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from langchain_core.documents import Document\n",
"\n",
"from text_extractor import TextExtractor\n",
"\n",
"load_dotenv()\n",
"\n",
"# OpenAI API Key\n",
"api_key = os.getenv('OPENAI_API_KEY')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vision Model Setup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Function to encode the image\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" return base64.b64encode(image_file.read()).decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def process_image(image_path, timeout=60):\n",
"    \"\"\"Describe an image using the OpenAI chat completions endpoint.\n",
"\n",
"    The image file is base64-encoded with ``encode_image``, embedded in a data\n",
"    URL and posted together with a fixed text prompt.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image file to send.\n",
"        timeout: Seconds to wait for the HTTP request before giving up.\n",
"\n",
"    Returns:\n",
"        The message content of the first choice in the API response, or the\n",
"        sentinel string 'Image not good enough for processing' when the request\n",
"        or the parsing of its response fails.\n",
"\n",
"    Raises:\n",
"        OSError: If the image file cannot be read; the file is encoded before\n",
"            the best-effort request section.\n",
"    \"\"\"\n",
"    # Getting the base64 string\n",
"    base64_image = encode_image(image_path)\n",
"\n",
"    # Advertise the actual image type in the data URL; fall back to JPEG when\n",
"    # the type cannot be guessed from the file name.\n",
"    mime_type, _ = mimetypes.guess_type(image_path)\n",
"    if not mime_type or not mime_type.startswith(\"image/\"):\n",
"        mime_type = \"image/jpeg\"\n",
"\n",
"    headers = {\n",
"        \"Content-Type\": \"application/json\",\n",
"        \"Authorization\": f\"Bearer {api_key}\"\n",
"    }\n",
"\n",
"    payload = {\n",
"        \"model\": \"gpt-4o-mini\",\n",
"        \"messages\": [\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": [\n",
"                    {\n",
"                        \"type\": \"text\",\n",
"                        \"text\": \"Whats in this image?\"\n",
"                    },\n",
"                    {\n",
"                        \"type\": \"image_url\",\n",
"                        \"image_url\": {\n",
"                            \"url\": f\"data:{mime_type};base64,{base64_image}\"\n",
"                        }\n",
"                    }\n",
"                ]\n",
"            }\n",
"        ],\n",
"        \"max_tokens\": 300\n",
"    }\n",
"\n",
"    try:\n",
"        # Without a timeout, requests waits indefinitely on a stalled connection.\n",
"        response = requests.post(\n",
"            \"https://api.openai.com/v1/chat/completions\",\n",
"            headers=headers,\n",
"            json=payload,\n",
"            timeout=timeout,\n",
"        )\n",
"        # Turn HTTP error statuses into exceptions so the warning names the real cause.\n",
"        response.raise_for_status()\n",
"        # returning the content of the response\n",
"        description = response.json()['choices'][0]['message']['content']\n",
"    except Exception as error:\n",
"        # Deliberately best-effort: callers compare against this exact sentinel,\n",
"        # so keep returning it, but surface why the call failed.\n",
"        warnings.warn(f\"process_image failed for {image_path!r}: {error!r}\")\n",
"        description = \"Image not good enough for processing\"\n",
"\n",
"    return description"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def create_image_document(image_path):\n",
"    \"\"\"Build a ``Document`` from the text found in an image.\n",
"\n",
"    Text is first read with ``TextExtractor.read_text_from_image``. When that\n",
"    yields nothing usable, the image is described by ``process_image`` instead.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image file.\n",
"\n",
"    Returns:\n",
"        A one-element list holding the ``Document``, whose ``filename`` metadata\n",
"        is the file name without directory or extension, or ``None`` when the\n",
"        vision fallback reports failure.\n",
"    \"\"\"\n",
"    # File name without directory or final extension; os.path keeps dots that\n",
"    # are part of the name and honours the platform's path separator.\n",
"    image_name = os.path.splitext(os.path.basename(image_path))[0]\n",
"    # setting image name as metadata\n",
"    metadata = {'filename': image_name}\n",
"\n",
"    text_extractor = TextExtractor()\n",
"    # NOTE(review): assumes read_text_from_image returns a str; a None result is\n",
"    # treated as 'no text' -- confirm against TextExtractor.\n",
"    raw_text = text_extractor.read_text_from_image(image_path) or ''\n",
"    # Drop special characters; letters, digits and whitespace (line breaks included) are kept.\n",
"    text = ''.join(ch for ch in raw_text if ch.isalnum() or ch.isspace())\n",
"\n",
"    # Whitespace-only output carries no content either, so it also triggers the\n",
"    # OpenAI vision model fallback.\n",
"    if not text.strip():\n",
"        text = process_image(image_path)\n",
"\n",
"    # process_image signals failure with this exact sentinel string.\n",
"    if text == \"Image not good enough for processing\":\n",
"        return None\n",
"\n",
"    return [Document(page_content=text, metadata=metadata)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(metadata={'filename': 'hyundai-sonata-auto-body-repair-before'}, page_content=\"The image shows a dark-colored car with visible damage on the driver's side. The damage appears to be a dent and scratches on the door and fender area. The car is parked indoors, likely in a garage.\")]\n"
]
}
],
"source": [
"# testing the function\n",
"image_path = 'data/hyundai-sonata-auto-body-repair-before.jpg'\n",
"text = create_image_document(image_path)\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "smog_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}