Files
ds-smart-farm-project/src/utils/validation.py
T

183 lines
6.5 KiB
Python
Raw Normal View History

"""
Validation utilities for agricultural keyword tagging system
"""
import re
from typing import Any, Dict, List, Tuple

import pandas as pd
class KeywordValidator:
    """Validate and score keyword and title quality for agricultural photos.

    Attributes:
        agricultural_terms: Maps a relevance tier ('high_value',
            'medium_value', 'low_value') to the lowercase terms in that tier.
    """

    # Words whose presence (as a substring, case-insensitively) marks a title
    # as having agricultural context.
    _TITLE_AGRICULTURAL_WORDS = (
        'farm', 'agriculture', 'crop', 'livestock', 'rural',
        'farmer', 'rancher', 'tractor', 'field', 'barn',
    )

    def __init__(self):
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice',
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation',
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new',
            ],
        }

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Each keyword earns 3 points for a high-value term, 2 for medium,
        1 for low and 0.5 for anything else. Matching ignores case and
        surrounding/repeated whitespace. If no high- or medium-value term is
        present the raw score is halved. The result is normalized against the
        best possible score (3 points per keyword).

        Args:
            keywords: Keywords to evaluate.

        Returns:
            A dict with 'score' (0-100, one decimal), 'issues' (list of
            human-readable problems), 'keyword_count' and
            'agricultural_relevance' (whether a high/medium term was found).
        """
        if not keywords:
            # Same keys as the normal path so callers can index unconditionally.
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        keyword_count = len(keywords)

        # Check keyword count
        if keyword_count < 5:
            issues.append(f'Only {keyword_count} keywords (minimum 5 recommended)')
        elif keyword_count > 10:
            issues.append(f'{keyword_count} keywords (maximum 10 recommended)')

        # Build lookup sets once per call; read from the instance attribute
        # each time so edits to agricultural_terms are picked up.
        high_value = set(self.agricultural_terms['high_value'])
        medium_value = set(self.agricultural_terms['medium_value'])
        low_value = set(self.agricultural_terms['low_value'])

        # Score keywords based on agricultural relevance
        score = 0
        has_agricultural_term = False
        for keyword in keywords:
            # The term lists are lowercase, so normalize before comparing.
            term = ' '.join(str(keyword).lower().split())
            if term in high_value:
                score += 3
                has_agricultural_term = True
            elif term in medium_value:
                score += 2
                has_agricultural_term = True
            elif term in low_value:
                score += 1
            else:
                score += 0.5  # Generic terms

        # Check for required agricultural content
        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100); keyword_count >= 1 here, so no zero division.
        max_possible_score = keyword_count * 3
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': keyword_count,
            'agricultural_relevance': has_agricultural_term,
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from a score of 100 and deducts 20 for a title under 10
        characters, 10 for one over 100 characters, 30 when no agricultural
        word appears in it, and 5 when the first character is not uppercase.

        Args:
            title: Title text to evaluate.

        Returns:
            A dict with 'score' (never below 0), 'issues' (list of
            human-readable problems), 'length' and 'agricultural_content'.
        """
        if not title:
            # Same keys as the normal path so callers can index unconditionally.
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': 0,
                'agricultural_content': False,
            }

        issues = []
        score = 100
        length = len(title)

        # Check length
        if length < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif length > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content (substring match, case-insensitive)
        lowered = title.lower()
        has_ag_content = any(
            word in lowered for word in self._TITLE_AGRICULTURAL_WORDS
        )
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': length,
            'agricultural_content': has_ag_content,
        }
class DataQualityChecker:
    """Check data quality for batch processing."""

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Args:
            csv_path: Path of the CSV file to check.

        Returns:
            On success, a dict with 'valid' True, 'total_rows', the number of
            missing values in 'ai_keywords' and 'ai_title', and a
            'completion_rate' dict of percentages (one decimal) for keywords
            and titles. A file with no data rows reports 0.0 for both rates.
            If the file cannot be read or lacks a required column, a dict
            with 'valid' False and an 'error' message.
        """
        required_columns = [
            'filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location',
        ]

        # Deliberately broad: any read/parse failure is reported to the
        # caller as an error dict instead of an exception.
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}'
            }

        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return {
                'valid': False,
                'error': f'Missing required columns: {missing_columns}'
            }

        total_rows = len(df)
        # Check for empty critical fields. Converted to plain int so the
        # result holds no numpy scalars (which json.dumps rejects).
        empty_ai_keywords = int(df['ai_keywords'].isna().sum())
        empty_ai_titles = int(df['ai_title'].isna().sum())

        def completion_rate(empty_count: int) -> float:
            # A header-only file would otherwise divide by zero and yield NaN.
            if total_rows == 0:
                return 0.0
            return round((total_rows - empty_count) / total_rows * 100, 1)

        return {
            'valid': True,
            'total_rows': total_rows,
            'empty_ai_keywords': empty_ai_keywords,
            'empty_ai_titles': empty_ai_titles,
            'completion_rate': {
                'keywords': completion_rate(empty_ai_keywords),
                'titles': completion_rate(empty_ai_titles),
            }
        }

    @staticmethod
    def check_batch_performance(processing_times: List[float], image_count: int) -> Dict[str, Any]:
        """Analyze batch processing performance.

        Args:
            processing_times: Per-image processing durations in seconds.
            image_count: Number of images in the batch. Reported back as
                'total_images'; the average is taken over processing_times.

        Returns:
            A dict with totals, the average time per image, a
            'performance_rating' ('excellent', 'good' or 'needs_improvement')
            and projected run times in minutes for 500 and 1000 images, or
            {'error': ...} when processing_times is empty.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        # Performance thresholds, in seconds per image
        excellent_time_per_image = 2.0
        target_time_per_image = 5.0

        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1)  # minutes
        }
def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens the file with Pillow and runs its verify() check.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if the file opened and verified without error, False on any
        failure.
    """
    try:
        # Imported lazily so the module loads without Pillow installed.
        # NOTE(review): a missing Pillow install therefore also yields False.
        from PIL import Image

        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Best-effort check: every ordinary failure means "not a valid image".
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return False