200 lines
5.6 KiB
Plaintext
200 lines
5.6 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"%pip install -q pdfplumber"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from text_extractor import TextExtractor\n",
|
||
"from langchain_core.documents import Document\n",
|
||
"import os\n",
|
||
"import base64\n",
|
||
"import requests\n",
|
||
"from dotenv import load_dotenv\n",
|
"# Load variables from a .env file (if present) into the process environment.\n",
"load_dotenv()\n",
"\n",
"# OpenAI API key taken from the environment; None when OPENAI_API_KEY is unset.\n",
"api_key = os.environ.get('OPENAI_API_KEY')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Vision Model Set Up"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def encode_image(image_path):\n",
"    \"\"\"Read a file in binary mode and return its contents as base64 text.\n",
"\n",
"    Args:\n",
"        image_path: Path of the file to read.\n",
"\n",
"    Returns:\n",
"        str: The file's bytes, base64-encoded and decoded as UTF-8.\n",
"    \"\"\"\n",
"    with open(image_path, mode=\"rb\") as handle:\n",
"        raw_bytes = handle.read()\n",
"    encoded = base64.b64encode(raw_bytes)\n",
"    return encoded.decode('utf-8')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def process_image(image_path, timeout=60):\n",
"    \"\"\"Ask the OpenAI chat-completions endpoint what an image shows.\n",
"\n",
"    The file is base64-encoded with encode_image and sent inline as a data\n",
"    URL in a single user message.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image file to send.\n",
"        timeout: Seconds to wait for the HTTP call; without one, requests\n",
"            would wait indefinitely on a stalled connection.\n",
"\n",
"    Returns:\n",
"        The message content of the first choice in the reply, or the\n",
"        sentinel string \"Image not good enough for processing\" when the\n",
"        request fails or the reply lacks the expected fields.\n",
"    \"\"\"\n",
"    # api_key is only read here, so no `global` declaration is required.\n",
"    headers = {\n",
"        \"Content-Type\": \"application/json\",\n",
"        \"Authorization\": f\"Bearer {api_key}\"\n",
"    }\n",
"\n",
"    # Getting the base64 string\n",
"    base64_image = encode_image(image_path)\n",
"\n",
"    # NOTE(review): the data URL always declares image/jpeg, whatever the\n",
"    # file's real format is -- confirm the API accepts that for PNG/WEBP.\n",
"    payload = {\n",
"        \"model\": \"gpt-4o-mini\",\n",
"        \"messages\": [\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": [\n",
"                    {\n",
"                        \"type\": \"text\",\n",
"                        \"text\": \"What’s in this image?\"\n",
"                    },\n",
"                    {\n",
"                        \"type\": \"image_url\",\n",
"                        \"image_url\": {\n",
"                            \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n",
"                        }\n",
"                    }\n",
"                ]\n",
"            }\n",
"        ],\n",
"        \"max_tokens\": 300\n",
"    }\n",
"\n",
"    try:\n",
"        response = requests.post(\n",
"            \"https://api.openai.com/v1/chat/completions\",\n",
"            headers=headers,\n",
"            json=payload,\n",
"            timeout=timeout,\n",
"        )\n",
"        # Turn 4xx/5xx replies (bad key, rate limit, ...) into exceptions\n",
"        # instead of failing later on a missing 'choices' key.\n",
"        response.raise_for_status()\n",
"        # returning the content of the response\n",
"        return response.json()['choices'][0]['message']['content']\n",
"    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as error:\n",
"        # Best effort: keep the sentinel that callers test for, but report\n",
"        # the real cause instead of hiding it.\n",
"        print(f\"process_image failed for {image_path!r}: {error!r}\")\n",
"        return \"Image not good enough for processing\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def create_image_document(image_path):\n",
"    \"\"\"Build a langchain_core Document holding the text found in an image.\n",
"\n",
"    Text is first read with TextExtractor.read_text_from_image; when that\n",
"    yields nothing usable, the image is described by process_image instead.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image file.\n",
"\n",
"    Returns:\n",
"        A one-element list containing the Document, whose metadata is\n",
"        {'filename': <file name without directory or extension>}, or None\n",
"        when process_image reported a failure.\n",
"    \"\"\"\n",
"    # os.path honours the platform's separators and strips only the final\n",
"    # extension, so 'scan.v2.png' is stored as 'scan.v2' rather than 'scan'.\n",
"    image_name = os.path.splitext(os.path.basename(image_path))[0]\n",
"    # setting image name as metadata\n",
"    metadata = {'filename': image_name}\n",
"\n",
"    text_extractor = TextExtractor()\n",
"    # `or ''` guards against a None result -- TODO confirm whether the\n",
"    # extractor can return None.\n",
"    text = text_extractor.read_text_from_image(image_path) or ''\n",
"    # Drop punctuation and other special characters; whitespace (including\n",
"    # line breaks) is kept.\n",
"    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())\n",
"\n",
"    # Whitespace-only OCR output counts as empty: fall back to the OpenAI\n",
"    # vision model.\n",
"    if not text.strip():\n",
"        text = process_image(image_path)\n",
"\n",
"    # process_image signals failure with this sentinel string.\n",
"    if text == \"Image not good enough for processing\":\n",
"        return None\n",
"\n",
"    # creating a document from the text\n",
"    return [Document(page_content=text, metadata=metadata)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[Document(metadata={'filename': 'hyundai-sonata-auto-body-repair-before'}, page_content=\"The image shows a dark-colored car with visible damage on the driver's side. The damage appears to be a dent and scratches on the door and fender area. The car is parked indoors, likely in a garage.\")]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Smoke test: run the pipeline on one sample image and print what comes back.\n",
"image_path = 'data/hyundai-sonata-auto-body-repair-before.jpg'\n",
"text = create_image_document(image_path=image_path)\n",
"print(text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "smog_env",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|