Complete Enhanced Agricultural AI System - All Requirements Met
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Validation utilities for agricultural keyword tagging system
|
||||
"""
|
||||
|
||||
import re
from typing import Any, Dict, List, Tuple

import pandas as pd
|
||||
|
||||
class KeywordValidator:
    """Validates and scores keyword quality for agricultural photos.

    Keywords are matched against three relevance tiers stored in
    ``self.agricultural_terms`` ('high_value', 'medium_value', 'low_value').
    Titles are checked for length, agricultural wording and capitalization.
    """

    def __init__(self):
        # Relevance tiers used by validate_keywords: a keyword found in
        # 'high_value' earns 3 points, 'medium_value' 2, 'low_value' 1.
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice'
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation'
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new'
            ]
        }

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Matching is case-insensitive and ignores surrounding whitespace, so
        'Farmer' and ' corn ' are recognised the same way as 'farmer' and
        'corn'.

        Args:
            keywords: Keywords assigned to one photo.

        Returns:
            A dict with:
              - 'score': 0-100 relevance score rounded to one decimal
                (0 when no keywords are given),
              - 'issues': list of human-readable problems found,
              - 'keyword_count': number of keywords received,
              - 'agricultural_relevance': True when at least one keyword is
                a high- or medium-value agricultural term.
        """
        if not keywords:
            # Same keys as the normal result so callers can index the dict
            # without special-casing empty input.
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        keyword_count = len(keywords)

        # Check keyword count
        if keyword_count < 5:
            issues.append(f'Only {keyword_count} keywords (minimum 5 recommended)')
        elif keyword_count > 10:
            issues.append(f'{keyword_count} keywords (maximum 10 recommended)')

        # Build the lookup sets once instead of scanning (and concatenating)
        # the tier lists for every keyword.
        high_value = {term.lower() for term in self.agricultural_terms['high_value']}
        medium_value = {term.lower() for term in self.agricultural_terms['medium_value']}
        low_value = {term.lower() for term in self.agricultural_terms['low_value']}

        # Score keywords based on agricultural relevance
        score = 0.0
        has_agricultural_term = False
        for keyword in keywords:
            term = str(keyword).strip().lower()
            if term in high_value:
                score += 3
                has_agricultural_term = True
            elif term in medium_value:
                score += 2
                has_agricultural_term = True
            elif term in low_value:
                score += 1
            else:
                score += 0.5  # Generic terms

        # Check for required agricultural content
        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100); the best case is every keyword being
        # high-value (3 points each). keyword_count is > 0 here.
        max_possible_score = keyword_count * 3
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': keyword_count,
            'agricultural_relevance': has_agricultural_term
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from a score of 100 and subtracts penalties: 20 for a title
        shorter than 10 characters, 10 for one longer than 100 characters,
        30 when none of the agricultural words occurs in it (case-insensitive
        substring match) and 5 when the first character is not uppercase.

        Args:
            title: Title text; empty, None or whitespace-only counts as
                missing.

        Returns:
            A dict with 'score' (0-100), 'issues' (list of problems),
            'length' (character count of the title) and
            'agricultural_content' (bool).
        """
        if not title or not title.strip():
            # Same keys as the normal result; a blank title is treated as
            # missing rather than being scored.
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': len(title) if title else 0,
                'agricultural_content': False,
            }

        issues = []
        score = 100

        # Check length
        if len(title) < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif len(title) > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content
        agricultural_words = (
            'farm', 'agriculture', 'crop', 'livestock', 'rural',
            'farmer', 'rancher', 'tractor', 'field', 'barn',
        )
        lowered_title = title.lower()  # lowercase once, not once per word
        has_ag_content = any(word in lowered_title for word in agricultural_words)
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': len(title),
            'agricultural_content': has_ag_content
        }
|
||||
|
||||
class DataQualityChecker:
    """Check data quality for batch processing."""

    @staticmethod
    def _completion_rate(total_rows: int, empty_count: int) -> float:
        """Return the percentage of non-empty rows, or 0.0 for an empty table."""
        if total_rows == 0:
            # Avoid dividing by zero for a header-only CSV.
            return 0.0
        return round((total_rows - empty_count) / total_rows * 100, 1)

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Args:
            csv_path: Path of the CSV file to check.

        Returns:
            On failure ``{'valid': False, 'error': <message>}`` - either the
            file could not be read or required columns are missing.
            On success a dict with 'valid' (True), 'total_rows',
            'empty_ai_keywords', 'empty_ai_titles' (plain ints) and
            'completion_rate' holding the percentage of rows whose
            'ai_keywords' / 'ai_title' value is present. A file with a header
            but no rows reports a completion rate of 0.0.
        """
        # Only the read is guarded: any failure to load the file is reported
        # to the caller as an invalid result instead of raising.
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}'
            }

        required_columns = ['filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return {
                'valid': False,
                'error': f'Missing required columns: {missing_columns}'
            }

        # Check for empty critical fields. int() turns the numpy integers
        # returned by .sum() into plain Python ints.
        total_rows = len(df)
        empty_ai_keywords = int(df['ai_keywords'].isna().sum())
        empty_ai_titles = int(df['ai_title'].isna().sum())

        return {
            'valid': True,
            'total_rows': total_rows,
            'empty_ai_keywords': empty_ai_keywords,
            'empty_ai_titles': empty_ai_titles,
            'completion_rate': {
                'keywords': DataQualityChecker._completion_rate(total_rows, empty_ai_keywords),
                'titles': DataQualityChecker._completion_rate(total_rows, empty_ai_titles)
            }
        }

    @staticmethod
    def check_batch_performance(processing_times: List[float], image_count: int) -> Dict[str, Any]:
        """Analyze batch processing performance.

        Args:
            processing_times: Per-image processing durations in seconds.
            image_count: Number of images in the batch; reported back as
                'total_images' and not used in the calculations.

        Returns:
            ``{'error': ...}`` when no times are given; otherwise a dict with
            the total and average time in seconds, a 'performance_rating'
            ('excellent', 'good' or 'needs_improvement') and the projected
            duration in minutes for 500 and 1000 images at the average rate.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        # Performance thresholds (average seconds per image)
        excellent_time_per_image = 2.0
        target_time_per_image = 5.0
        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1)  # minutes
        }
|
||||
|
||||
def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens the file with Pillow and calls ``Image.verify()`` on it.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens and verifies without error; False on any
        failure, including a missing file or Pillow not being importable.
    """
    try:
        # Imported inside the try so that a missing Pillow installation is
        # reported as "not valid" rather than raising.
        from PIL import Image

        with Image.open(file_path) as img:
            img.verify()
    except Exception:
        # Exception instead of a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return True
|
||||
Reference in New Issue
Block a user