183 lines
6.5 KiB
Python
183 lines
6.5 KiB
Python
|
|
"""
|
||
|
|
Validation utilities for agricultural keyword tagging system
|
||
|
|
"""
|
||
|
|
|
||
|
|
import re
from typing import Any, Dict, List, Tuple

import pandas as pd
|
||
|
|
|
||
|
|
class KeywordValidator:
    """Validate and score keyword and title quality for agricultural photos.

    Attributes:
        agricultural_terms: Maps a relevance tier name ('high_value',
            'medium_value', 'low_value') to the list of terms in that tier.
    """

    # Substrings whose presence marks a title as having agricultural context.
    _TITLE_TERMS = (
        'farm', 'agriculture', 'crop', 'livestock', 'rural',
        'farmer', 'rancher', 'tractor', 'field', 'barn',
    )

    def __init__(self):
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice',
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation',
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new',
            ],
        }

    @staticmethod
    def _normalize(keyword) -> str:
        """Return *keyword* stripped and lowercased ('' for non-strings)."""
        # Non-string entries (e.g. NaN from a CSV cell) must not crash
        # validation; they are scored as generic terms instead.
        return keyword.strip().lower() if isinstance(keyword, str) else ''

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Keywords are compared case-insensitively and with surrounding
        whitespace ignored. Each keyword earns 3 / 2 / 1 points for a
        high / medium / low value term and 0.5 points otherwise. If no high
        or medium value term is present the total is halved. The total is
        then expressed as a percentage of the maximum (3 points per keyword).

        Args:
            keywords: Keywords attached to one photo.

        Returns:
            A dict with keys 'score' (0-100, one decimal), 'issues' (list of
            human-readable problems), 'keyword_count' and
            'agricultural_relevance' (True when a high or medium value term
            was found).
        """
        if not keywords:
            # Same keys as the normal result so callers can index safely.
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        keyword_count = len(keywords)

        # Check keyword count
        if keyword_count < 5:
            issues.append(f'Only {keyword_count} keywords (minimum 5 recommended)')
        elif keyword_count > 10:
            issues.append(f'{keyword_count} keywords (maximum 10 recommended)')

        # Build the lookup sets per call so changes made to
        # self.agricultural_terms after construction are honoured.
        high_value, medium_value, low_value = (
            {self._normalize(term) for term in self.agricultural_terms[tier]}
            for tier in ('high_value', 'medium_value', 'low_value')
        )

        # Score keywords based on agricultural relevance
        score = 0
        has_agricultural_term = False
        for keyword in map(self._normalize, keywords):
            if keyword in high_value:
                score += 3
                has_agricultural_term = True
            elif keyword in medium_value:
                score += 2
                has_agricultural_term = True
            elif keyword in low_value:
                score += 1
            else:
                score += 0.5  # Generic terms

        # Check for required agricultural content
        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100)
        max_possible_score = keyword_count * 3
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': keyword_count,
            'agricultural_relevance': has_agricultural_term,
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from a score of 100 and subtracts 20 for a title shorter than
        10 characters, 10 for one longer than 100 characters, 30 when no
        agricultural term occurs in it (case-insensitive substring match),
        and 5 when the first character is not an uppercase letter.

        Args:
            title: The photo title to check.

        Returns:
            A dict with keys 'score' (0-100), 'issues' (list of
            human-readable problems), 'length' and 'agricultural_content'.
        """
        if not title:
            # Same keys as the normal result so callers can index safely.
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': 0,
                'agricultural_content': False,
            }

        issues = []
        score = 100

        # Check length
        if len(title) < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif len(title) > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content
        lowered_title = title.lower()
        has_ag_content = any(word in lowered_title for word in self._TITLE_TERMS)
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': len(title),
            'agricultural_content': has_ag_content,
        }
|
||
|
|
|
||
|
|
class DataQualityChecker:
    """Check data quality for batch processing."""

    @staticmethod
    def _completion_rate(total_rows: int, empty_count: int) -> float:
        """Return the percentage of non-empty rows, or 0.0 for an empty table."""
        if total_rows == 0:
            # Nothing was processed, so nothing was completed; this also
            # avoids dividing by zero.
            return 0.0
        return round((total_rows - empty_count) / total_rows * 100, 1)

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Args:
            csv_path: Path of the CSV file to check.

        Returns:
            On success, a dict with 'valid' set to True, 'total_rows',
            'empty_ai_keywords', 'empty_ai_titles' (plain ints) and
            'completion_rate' (percentages for 'keywords' and 'titles').
            When the file cannot be read or a required column is missing,
            a dict with 'valid' set to False and an 'error' message.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            # Deliberately broad: this is a reporting boundary, and any read
            # or parse failure is returned as data rather than raised.
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}'
            }

        required_columns = ['filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return {
                'valid': False,
                'error': f'Missing required columns: {missing_columns}'
            }

        # Check for empty critical fields. Convert the numpy integers that
        # pandas returns to plain ints so the result stays JSON-serializable.
        total_rows = len(df)
        empty_ai_keywords = int(df['ai_keywords'].isna().sum())
        empty_ai_titles = int(df['ai_title'].isna().sum())

        return {
            'valid': True,
            'total_rows': total_rows,
            'empty_ai_keywords': empty_ai_keywords,
            'empty_ai_titles': empty_ai_titles,
            'completion_rate': {
                'keywords': DataQualityChecker._completion_rate(total_rows, empty_ai_keywords),
                'titles': DataQualityChecker._completion_rate(total_rows, empty_ai_titles),
            },
        }

    @staticmethod
    def check_batch_performance(processing_times: List[float], image_count: int) -> Dict[str, Any]:
        """Analyze batch processing performance.

        The average is taken over ``processing_times``; ``image_count`` is
        only echoed back as 'total_images'.

        Args:
            processing_times: Per-image processing times in seconds.
            image_count: Number of images in the batch.

        Returns:
            A dict with 'total_images', 'total_time_seconds',
            'average_time_per_image', 'performance_rating' ('excellent',
            'good' or 'needs_improvement') and the projected time in minutes
            for 500 and 1000 images; or ``{'error': ...}`` when
            ``processing_times`` is empty.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        # Performance thresholds
        excellent_time_per_image = 2.0  # seconds
        target_time_per_image = 5.0  # seconds
        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1)  # minutes
        }
|
||
|
|
|
||
|
|
def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens the file with Pillow and runs its integrity check.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when Pillow opens and verifies the file; False on any failure,
        including a missing file or Pillow not being importable.
    """
    try:
        # Imported lazily so the rest of the module works without Pillow.
        from PIL import Image

        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Any failure here means "not a valid image". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate.
        return False
|