"""Validation utilities for agricultural keyword tagging system."""

import re
from typing import Any, Dict, List, Tuple

import pandas as pd


class KeywordValidator:
    """Validates and scores keyword quality for agricultural photos."""

    # Recommended bounds on the number of keywords per photo.
    MIN_KEYWORDS = 5
    MAX_KEYWORDS = 10

    # Points awarded per keyword, by relevance tier.
    HIGH_VALUE_POINTS = 3
    MEDIUM_VALUE_POINTS = 2
    LOW_VALUE_POINTS = 1
    GENERIC_POINTS = 0.5

    def __init__(self):
        # Term lists are stored lowercase; lookups normalize keywords to match.
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice',
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation',
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new',
            ],
        }

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Each keyword is matched (ignoring case and surrounding whitespace)
        against the tiered term lists in ``self.agricultural_terms`` and
        awarded points; the total is normalized to a 0-100 score. The score
        is halved when no high- or medium-value term is present.

        Args:
            keywords: Keywords attached to a photo.

        Returns:
            A dict with keys ``score`` (0-100, one decimal), ``issues``
            (list of human-readable problems), ``keyword_count`` and
            ``agricultural_relevance`` (True when at least one high- or
            medium-value term matched).
        """
        if not keywords:
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        keyword_count = len(keywords)

        # Check keyword count
        if keyword_count < self.MIN_KEYWORDS:
            issues.append(
                f'Only {keyword_count} keywords (minimum 5 recommended)'
            )
        elif keyword_count > self.MAX_KEYWORDS:
            issues.append(
                f'{keyword_count} keywords (maximum 10 recommended)'
            )

        # Build the lookup sets once per call (not once per keyword). They are
        # derived from the public dict so later edits to it are honored.
        high_value = set(self.agricultural_terms['high_value'])
        medium_value = set(self.agricultural_terms['medium_value'])
        low_value = set(self.agricultural_terms['low_value'])

        # Score keywords based on agricultural relevance
        score = 0.0
        has_agricultural_term = False
        for keyword in keywords:
            # Normalize so 'Farmer' or ' tractor ' match the lowercase lists.
            term = str(keyword).strip().lower()
            if term in high_value:
                score += self.HIGH_VALUE_POINTS
                has_agricultural_term = True
            elif term in medium_value:
                score += self.MEDIUM_VALUE_POINTS
                has_agricultural_term = True
            elif term in low_value:
                score += self.LOW_VALUE_POINTS
            else:
                score += self.GENERIC_POINTS  # Generic terms

        # Check for required agricultural content
        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100); keyword_count is > 0 here because the
        # empty case returned early.
        max_possible_score = keyword_count * self.HIGH_VALUE_POINTS
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': keyword_count,
            'agricultural_relevance': has_agricultural_term,
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from a score of 100 and deducts points for a title that is
        too short or too long, lacks agricultural vocabulary, or does not
        start with a capital letter.

        Args:
            title: Photo title to check.

        Returns:
            A dict with keys ``score`` (0-100), ``issues``, ``length`` and
            ``agricultural_content``.
        """
        if not title:
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': 0,
                'agricultural_content': False,
            }

        issues = []
        score = 100

        # Check length
        if len(title) < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif len(title) > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content (substring match, case-insensitive)
        agricultural_words = [
            'farm', 'agriculture', 'crop', 'livestock', 'rural',
            'farmer', 'rancher', 'tractor', 'field', 'barn',
        ]
        lowered_title = title.lower()
        has_ag_content = any(
            word in lowered_title for word in agricultural_words
        )
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': len(title),
            'agricultural_content': has_ag_content,
        }


class DataQualityChecker:
    """Check data quality for batch processing."""

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Args:
            csv_path: Location of the CSV file, passed to ``pd.read_csv``.

        Returns:
            On success, a dict with ``valid`` True, ``total_rows``, the
            counts of missing ``ai_keywords`` / ``ai_title`` values and a
            ``completion_rate`` dict of percentages (0.0 for a file with no
            rows). On failure, a dict with ``valid`` False and an ``error``
            message. This method does not raise for read or parse errors.
        """
        # The whole check stays inside the try so callers always get a
        # result dict rather than an exception.
        try:
            df = pd.read_csv(csv_path)

            required_columns = [
                'filename', 'human_keywords', 'ai_keywords',
                'ai_title', 'location',
            ]
            missing_columns = [
                col for col in required_columns if col not in df.columns
            ]
            if missing_columns:
                return {
                    'valid': False,
                    'error': f'Missing required columns: {missing_columns}',
                }

            total_rows = len(df)

            # Check for empty critical fields. Convert numpy integers to
            # plain ints so the result can be serialized (e.g. to JSON).
            empty_ai_keywords = int(df['ai_keywords'].isna().sum())
            empty_ai_titles = int(df['ai_title'].isna().sum())

            def completion(empty_count: int) -> float:
                # A header-only file has nothing completed; avoid 0/0.
                if total_rows == 0:
                    return 0.0
                return round((total_rows - empty_count) / total_rows * 100, 1)

            return {
                'valid': True,
                'total_rows': total_rows,
                'empty_ai_keywords': empty_ai_keywords,
                'empty_ai_titles': empty_ai_titles,
                'completion_rate': {
                    'keywords': completion(empty_ai_keywords),
                    'titles': completion(empty_ai_titles),
                },
            }
        except Exception as e:
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}',
            }

    @staticmethod
    def check_batch_performance(processing_times: List[float],
                                image_count: int) -> Dict[str, Any]:
        """Analyze batch processing performance.

        Args:
            processing_times: Per-image processing durations in seconds.
            image_count: Number of images in the batch; reported as-is and
                not used in the averages.

        Returns:
            A dict with totals, the average time per image, a rating
            (``excellent`` / ``good`` / ``needs_improvement``) and projected
            durations in minutes for 500 and 1000 images, or
            ``{'error': ...}`` when no timings are given.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        # Performance thresholds (seconds per image)
        excellent_time_per_image = 2.0
        target_time_per_image = 5.0

        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1),  # minutes
        }


def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens the file with Pillow and calls ``verify()`` on it.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens and verifies; False on any error,
        including Pillow not being installed.
    """
    try:
        from PIL import Image

        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Best-effort check: any failure means "not a valid image", but
        # KeyboardInterrupt / SystemExit must still propagate.
        return False