complete document ingestion pipeline
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
from flask_restx import Namespace, Resource, fields
|
||||
from flask import request, jsonify, current_app as app, send_file
|
||||
from ...services.ocr import OCRService
|
||||
from ...utils.decorators.auth import protected_route
|
||||
from .models.errors import error_404, error_500
|
||||
from .models.response import response
|
||||
import json
|
||||
import os
|
||||
import numpy as np
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
# Namespace under which the OCR resource below is registered.
api = Namespace(
    'OCR',
    description='Description',
    path='/v2/api/tools/ocr',
)

# Request parser for the upload: a single mandatory ``file`` entry read from
# the request's files and parsed as a werkzeug ``FileStorage``.
upload_parser = api.parser()
upload_parser.add_argument(
    'file',
    location='files',
    type=FileStorage,
    required=True,
)

# OCR data model: the output format plus an optional string payload.
ocr_model = api.model('OCR', {
    'format': fields.String(required=True),
    'data': fields.String(required=False),
})

# Response model: a clone of the imported ``response`` model extended with a
# nested ``model`` field holding the OCR data model.
success_response = api.clone('OCR Model Response', response, {
    'model': fields.Nested(ocr_model),
})
|
||||
|
||||
|
||||
@api.route('')
@api.doc(security='apikey')
class OCRResource(Resource):
    """Resource that runs OCR on an uploaded file and returns the result."""

    # Accepted spellings of the ``format`` field, mapped to the two values
    # this resource branches on and passes to ``OCRService.read_text``.
    # ``txt`` is kept because it was the historical default.
    _FORMAT_ALIASES = {'txt': 'text', 'text': 'text', 'json': 'json'}

    @api.doc('get_text')
    @api.expect(upload_parser)
    @protected_route
    def post(self):
        """Run OCR on the uploaded ``file`` and return the recognised text.

        The output format is read from a ``format`` field supplied either as
        a JSON body, a form field or a query parameter. It defaults to plain
        text when absent.

        Returns:
            * ``text``: the generated text file as an attachment, or a JSON
              ``error`` payload when the service produced no file.
            * ``json``: ``{'model': {'format': ..., 'data': <JSON string>}}``,
              or a JSON ``error`` payload when the service produced no data.

        Aborts:
            400 if the format is not recognised or no usable file name was
            uploaded; 500 if anything fails while saving or processing.
        """
        # Function-scope imports keep this block self-contained; werkzeug is
        # already a dependency of this module.
        from werkzeug.exceptions import HTTPException
        from werkzeug.utils import secure_filename

        # A multipart upload carries no JSON body, so a plain ``get_json()``
        # would fail before the file is even looked at. ``silent=True``
        # returns ``None`` instead, and we fall back to form/query values.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}
        requested = payload.get('format') or request.values.get('format') or 'txt'
        output_format = self._FORMAT_ALIASES.get(str(requested).strip().lower())
        if output_format is None:
            api.abort(code=400, message="Invalid Format", error=True)

        args = upload_parser.parse_args()
        file = args.get('file')

        # Never trust the client-supplied name: strip directory components
        # and unsafe characters so the upload cannot escape its directory.
        safe_name = secure_filename(file.filename) if file and file.filename else ''
        if not safe_name:
            api.abort(code=400, message="Invalid file", error=True)

        try:
            # NOTE(review): the file is saved relative to the working
            # directory, exactly as before (an earlier comment here claimed
            # UPLOAD_FOLDER). Kept because how OCRService resolves the path
            # against ``image_directory=''`` is not visible from this module.
            filename = os.path.join(safe_name)
            file.save(filename)

            ocr_service = OCRService(
                image_directory='',
                export_directory=os.path.join(app.config['UPLOAD_FOLDER']),
                language='en',
            )
            result = ocr_service.read_text(filename, output_format=output_format)

            if output_format == 'text':
                txt_file, message = result
                if txt_file:
                    return send_file(
                        os.path.join("..", '..', '..', txt_file),
                        mimetype='text/plain',
                        as_attachment=True,
                        download_name=txt_file,
                    )
                return jsonify(error=message)

            # Only 'json' can reach this point (see _FORMAT_ALIASES).
            json_data, message = result
            if json_data:
                # np_encoder converts NumPy values that json cannot encode.
                result_json = json.dumps(json_data, default=np_encoder)
                return {'model': {
                    'format': output_format,
                    'data': result_json,
                }}
            # Mirror the text branch: report the service's own message.
            return jsonify(error=message)

        except HTTPException:
            # Deliberate aborts must reach the client with their own status.
            raise
        except Exception:
            # Keep the generic client-facing message but record the cause.
            app.logger.exception("OCR request failed")
            api.abort(code=500, message="Something went wrong", error=True)
|
||||
|
||||
|
||||
def np_encoder(object):
    """``default`` hook for ``json.dumps`` that converts NumPy values.

    Args:
        object: The value ``json`` could not serialise on its own. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        The equivalent Python scalar for a NumPy scalar (``.item()``), or a
        nested list for a NumPy array (``.tolist()``).

    Raises:
        TypeError: If the value is not a NumPy scalar or array. This is what
            ``json.dumps`` expects from a ``default`` hook; returning ``None``
            would silently serialise unsupported values as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )
|
||||
Reference in New Issue
Block a user