image text extraction completed

This commit is contained in:
timothyafolami
2024-08-08 14:58:44 +01:00
parent 9a2a4c5fdd
commit c54dc17989
13 changed files with 331 additions and 7 deletions
+161
View File
@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!pip install -q pdfplumber"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from text_extractor import TextExtractor\n",
"from langchain_core.documents import Document"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# creating a function to extract texts from image\n",
"def create_image_document(image_path):\n",
" # getting the image name from the image path\n",
" image_name = image_path.split('/')[-1].split('.')[0]\n",
" # setting image name as metadata\n",
" metadata = {'filename': image_name}\n",
" text_extractor = TextExtractor()\n",
" text = text_extractor.read_text_from_image(image_path)\n",
" # removing special characters and line breaks\n",
" text = ''.join(e for e in text if e.isalnum() or e.isspace() or e == '\\n')\n",
" doc = Document(page_content=text, metadata=metadata)\n",
" # returning the document\n",
" return [doc]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(metadata={'filename': 'IMG_1438'}, page_content='ex a\\n\\nAccidented car before repair\\n')]\n"
]
}
],
"source": [
"# testing the function\n",
"image_path = 'data/IMG_1438.jpeg'\n",
"text = create_image_document(image_path)\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'filename': 'IMG_1438'}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text[0].metadata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "smog_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}