Files
ds-smart-farm-project/src/utils/validation.py
T
2025-07-16 20:35:20 +01:00

183 lines
6.5 KiB
Python

"""
Validation utilities for agricultural keyword tagging system
"""
import re
from typing import Any, Dict, List, Tuple

import pandas as pd
class KeywordValidator:
    """Validates and scores keyword and title quality for agricultural photos.

    Keyword terms are grouped into three tiers in ``self.agricultural_terms``
    (``high_value``, ``medium_value``, ``low_value``).  The tiers are read on
    every call, so changes made to that dict after construction take effect.
    """

    # Points awarded per tier; anything not listed in a tier is "generic".
    _TIER_POINTS = (('low_value', 1), ('medium_value', 2), ('high_value', 3))
    _GENERIC_POINTS = 0.5

    # Substrings whose presence marks a title as agricultural.
    _TITLE_TERMS = (
        'farm', 'agriculture', 'crop', 'livestock', 'rural',
        'farmer', 'rancher', 'tractor', 'field', 'barn',
    )

    def __init__(self):
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice',
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation',
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new',
            ],
        }

    @staticmethod
    def _normalize(keyword) -> str:
        """Return *keyword* as a stripped, lower-cased string for matching."""
        return str(keyword).strip().lower()

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Each keyword is matched case-insensitively (ignoring surrounding
        whitespace) against the term tiers and earns 3 / 2 / 1 points for a
        high / medium / low value term, or 0.5 for anything else.  The total
        is halved when no high or medium value term is present, then
        normalized to 0-100 against the maximum of 3 points per keyword.

        Args:
            keywords: Keywords to evaluate.

        Returns:
            A dict with ``score`` (0-100, one decimal), ``issues`` (list of
            messages), ``keyword_count`` and ``agricultural_relevance``.
        """
        if not keywords:
            # Same keys as the normal path so callers can index unconditionally.
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        total = len(keywords)

        # Check keyword count
        if total < 5:
            issues.append(f'Only {total} keywords (minimum 5 recommended)')
        elif total > 10:
            issues.append(f'{total} keywords (maximum 10 recommended)')

        # Build the lookup once per call.  Tiers are applied from low to high
        # so a term listed in several tiers keeps its highest value.
        points_by_term = {}
        for tier, points in self._TIER_POINTS:
            for term in self.agricultural_terms[tier]:
                points_by_term[self._normalize(term)] = points

        earned = [
            points_by_term.get(self._normalize(keyword), self._GENERIC_POINTS)
            for keyword in keywords
        ]
        score = sum(earned)

        # Medium value terms are worth 2 points, so ">= 2" means the keyword
        # matched a high or medium value term.
        has_agricultural_term = any(points >= 2 for points in earned)
        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100)
        max_possible_score = total * 3
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': total,
            'agricultural_relevance': has_agricultural_term,
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from 100 and deducts 20 for titles under 10 characters, 10 for
        titles over 100 characters, 30 when none of the agricultural terms
        occurs as a substring (case-insensitive), and 5 when the first
        character is not upper case.

        Args:
            title: Title text to evaluate.

        Returns:
            A dict with ``score`` (never below 0), ``issues``, ``length`` and
            ``agricultural_content``.
        """
        if not title:
            # Same keys as the normal path so callers can index unconditionally.
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': 0,
                'agricultural_content': False,
            }

        issues = []
        score = 100

        # Check length
        if len(title) < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif len(title) > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content
        lowered = title.lower()
        has_ag_content = any(word in lowered for word in self._TITLE_TERMS)
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': len(title),
            'agricultural_content': has_ag_content,
        }
class DataQualityChecker:
    """Check data quality for batch processing"""

    # Columns that must be present in the CSV output.
    REQUIRED_COLUMNS = (
        'filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location',
    )

    @staticmethod
    def _completion_rate(filled: int, total: int) -> float:
        """Return the percentage of filled rows, or 0.0 when there are no rows."""
        if total <= 0:
            return 0.0
        return round(filled / total * 100, 1)

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Reads the CSV with pandas, checks that every required column exists
        and counts missing (NaN) values in ``ai_keywords`` and ``ai_title``.

        Args:
            csv_path: Path of the CSV file to inspect.

        Returns:
            On success, a dict with ``valid`` True, ``total_rows``,
            ``empty_ai_keywords``, ``empty_ai_titles`` (plain ints) and
            ``completion_rate`` (percentages per field; 0.0 for a file with
            no data rows).  On failure, a dict with ``valid`` False and an
            ``error`` message.  This method does not raise for read errors.
        """
        try:
            df = pd.read_csv(csv_path)

            missing_columns = [
                col for col in DataQualityChecker.REQUIRED_COLUMNS
                if col not in df.columns
            ]
            if missing_columns:
                return {
                    'valid': False,
                    'error': f'Missing required columns: {missing_columns}'
                }

            total_rows = len(df)

            # Check for empty critical fields.  pandas returns numpy integers
            # here; convert to plain int so the report serializes cleanly.
            empty_ai_keywords = int(df['ai_keywords'].isna().sum())
            empty_ai_titles = int(df['ai_title'].isna().sum())

            rate = DataQualityChecker._completion_rate
            return {
                'valid': True,
                'total_rows': total_rows,
                'empty_ai_keywords': empty_ai_keywords,
                'empty_ai_titles': empty_ai_titles,
                'completion_rate': {
                    'keywords': rate(total_rows - empty_ai_keywords, total_rows),
                    'titles': rate(total_rows - empty_ai_titles, total_rows),
                }
            }
        except Exception as e:
            # Deliberately broad: any failure is reported in the result dict
            # instead of being raised to the caller.
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}'
            }

    @staticmethod
    def check_batch_performance(
        processing_times: List[float],
        image_count: int,
        *,
        excellent_time_per_image: float = 2.0,
        target_time_per_image: float = 5.0,
    ) -> Dict[str, Any]:
        """Analyze batch processing performance.

        Args:
            processing_times: Per-image processing times in seconds.
            image_count: Number of images in the batch; reported back as-is.
            excellent_time_per_image: Average time (seconds) at or below
                which the batch is rated ``'excellent'``.
            target_time_per_image: Average time (seconds) at or below which
                the batch is rated ``'good'``; anything slower is rated
                ``'needs_improvement'``.

        Returns:
            A dict with totals, the average time per image, the rating and
            projected durations in minutes for 500 and 1000 images, or
            ``{'error': ...}`` when no processing times are given.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1)  # minutes
        }
def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens the file with Pillow and runs ``Image.verify()`` on it.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens and verifies without error; False on any
        failure, including when Pillow cannot be imported.
    """
    try:
        # Imported lazily so the rest of the module works without Pillow.
        from PIL import Image

        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Best-effort check: report failure rather than raise.  Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate.
        return False