# src/utils/validation.py

"""
Validation utilities for agricultural keyword tagging system
"""

import re
from typing import Any, Dict, List, Tuple

import pandas as pd

class KeywordValidator:
    """Validate and score keywords and titles for agricultural photos.

    Attributes:
        agricultural_terms: Mapping of relevance tier (``'high_value'``,
            ``'medium_value'``, ``'low_value'``) to the list of terms that
            belong to that tier.
    """

    # Points awarded to a keyword found in each relevance tier.
    _TIER_POINTS = {'high_value': 3, 'medium_value': 2, 'low_value': 1}
    # Points awarded to a keyword that appears in no tier.
    _GENERIC_POINTS = 0.5
    # Recommended bounds on the number of keywords.
    _MIN_KEYWORDS = 5
    _MAX_KEYWORDS = 10
    # Substrings whose presence marks a title as agricultural.
    _TITLE_AGRICULTURAL_WORDS = (
        'farm', 'agriculture', 'crop', 'livestock', 'rural',
        'farmer', 'rancher', 'tractor', 'field', 'barn',
    )

    def __init__(self):
        self.agricultural_terms = {
            'high_value': [
                'farmer', 'rancher', 'dairy farmer', 'chicken farmer',
                'tractor', 'combine', 'harvester', 'cattle', 'livestock',
                'corn', 'wheat', 'soybean', 'cotton', 'rice'
            ],
            'medium_value': [
                'field', 'farm', 'barn', 'agriculture', 'farming',
                'rural', 'crop', 'harvest', 'planting', 'irrigation'
            ],
            'low_value': [
                'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new'
            ]
        }

    @staticmethod
    def _normalize(keyword) -> str:
        """Return *keyword* as a stripped, lower-cased string for matching."""
        return str(keyword).strip().lower()

    def validate_keywords(self, keywords: List[str]) -> Dict[str, Any]:
        """Validate keyword quality and relevance.

        Keywords are matched against ``agricultural_terms`` ignoring case and
        surrounding whitespace, so ``'Farmer'`` and ``' farmer '`` both count
        as the high-value term ``'farmer'``.

        Args:
            keywords: Keywords to score.

        Returns:
            A dict with ``'score'`` (0-100, rounded to one decimal),
            ``'issues'`` (list of human-readable problems),
            ``'keyword_count'`` and ``'agricultural_relevance'`` (True when
            at least one high- or medium-value term is present). The same
            keys are returned when *keywords* is empty.
        """
        if not keywords:
            return {
                'score': 0,
                'issues': ['No keywords provided'],
                'keyword_count': 0,
                'agricultural_relevance': False,
            }

        issues = []
        count = len(keywords)

        # Check keyword count
        if count < self._MIN_KEYWORDS:
            issues.append(
                f'Only {count} keywords (minimum {self._MIN_KEYWORDS} recommended)'
            )
        elif count > self._MAX_KEYWORDS:
            issues.append(
                f'{count} keywords (maximum {self._MAX_KEYWORDS} recommended)'
            )

        # Build the term -> points lookup once per call. Tiers are visited
        # from lowest to highest so a term listed in several tiers keeps the
        # highest value, matching the original if/elif priority.
        points_by_term = {}
        for tier in ('low_value', 'medium_value', 'high_value'):
            for term in self.agricultural_terms[tier]:
                points_by_term[self._normalize(term)] = self._TIER_POINTS[tier]

        # Score keywords based on agricultural relevance
        earned = [
            points_by_term.get(self._normalize(keyword), self._GENERIC_POINTS)
            for keyword in keywords
        ]
        score = sum(earned)

        # A keyword worth at least the medium-tier value is, by construction,
        # a high- or medium-value agricultural term.
        relevance_floor = self._TIER_POINTS['medium_value']
        has_agricultural_term = any(points >= relevance_floor for points in earned)

        if not has_agricultural_term:
            issues.append('No clear agricultural terms detected')
            score *= 0.5

        # Normalize score (0-100); count is non-zero here, so the divisor is too.
        max_possible_score = count * self._TIER_POINTS['high_value']
        normalized_score = min(100, (score / max_possible_score) * 100)

        return {
            'score': round(normalized_score, 1),
            'issues': issues,
            'keyword_count': count,
            'agricultural_relevance': has_agricultural_term
        }

    def validate_title(self, title: str) -> Dict[str, Any]:
        """Validate title quality for stock photos.

        Starts from a score of 100 and subtracts a penalty for each problem
        found (length out of range, no agricultural word, first character not
        upper-case).

        Args:
            title: Title text to check.

        Returns:
            A dict with ``'score'`` (never below 0), ``'issues'``,
            ``'length'`` and ``'agricultural_content'``. An empty or
            whitespace-only title yields a score of 0 with the same keys.
        """
        # A title made only of whitespace carries no content; treat it the
        # same as a missing title instead of scoring it.
        if not title or not title.strip():
            return {
                'score': 0,
                'issues': ['No title provided'],
                'length': len(title or ''),
                'agricultural_content': False,
            }

        issues = []
        score = 100

        # Check length
        if len(title) < 10:
            issues.append('Title too short (minimum 10 characters)')
            score -= 20
        elif len(title) > 100:
            issues.append('Title too long (maximum 100 characters)')
            score -= 10

        # Check for agricultural content (case-insensitive substring match)
        lowered = title.lower()
        has_ag_content = any(
            word in lowered for word in self._TITLE_AGRICULTURAL_WORDS
        )
        if not has_ag_content:
            issues.append('Title lacks agricultural context')
            score -= 30

        # Check capitalization
        if not title[0].isupper():
            issues.append('Title should start with capital letter')
            score -= 5

        return {
            'score': max(0, score),
            'issues': issues,
            'length': len(title),
            'agricultural_content': has_ag_content
        }

class DataQualityChecker:
    """Check data quality for batch processing."""

    # Columns every output CSV must contain.
    _REQUIRED_COLUMNS = (
        'filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location',
    )

    @staticmethod
    def _completion_rate(total_rows: int, empty_rows: int) -> float:
        """Return the percentage of non-empty rows, or 0.0 when there are no rows."""
        if total_rows == 0:
            return 0.0
        return round((total_rows - empty_rows) / total_rows * 100, 1)

    @staticmethod
    def validate_csv_output(csv_path: str) -> Dict[str, Any]:
        """Validate CSV output format and content.

        Args:
            csv_path: Path of the CSV file to read with ``pandas.read_csv``.

        Returns:
            On success, a dict with ``'valid': True``, ``'total_rows'``,
            ``'empty_ai_keywords'``, ``'empty_ai_titles'`` and a nested
            ``'completion_rate'`` dict (percentages for ``'keywords'`` and
            ``'titles'``). All numbers are plain ``int``/``float`` so the
            result can be passed to ``json.dumps``. A file with a header but
            no rows reports completion rates of 0.0.

            On failure (unreadable file or missing required columns), a dict
            with ``'valid': False`` and an ``'error'`` message. This method
            does not raise.
        """
        try:
            df = pd.read_csv(csv_path)

            missing_columns = [
                col for col in DataQualityChecker._REQUIRED_COLUMNS
                if col not in df.columns
            ]
            if missing_columns:
                return {
                    'valid': False,
                    'error': f'Missing required columns: {missing_columns}'
                }

            total_rows = len(df)
            # .sum() yields a numpy integer; convert so the result stays
            # JSON-serializable.
            empty_ai_keywords = int(df['ai_keywords'].isna().sum())
            empty_ai_titles = int(df['ai_title'].isna().sum())

            return {
                'valid': True,
                'total_rows': total_rows,
                'empty_ai_keywords': empty_ai_keywords,
                'empty_ai_titles': empty_ai_titles,
                'completion_rate': {
                    'keywords': DataQualityChecker._completion_rate(
                        total_rows, empty_ai_keywords
                    ),
                    'titles': DataQualityChecker._completion_rate(
                        total_rows, empty_ai_titles
                    ),
                }
            }

        except Exception as e:
            # Deliberate catch-all: callers receive an error dict rather than
            # an exception, whatever went wrong while reading or inspecting.
            return {
                'valid': False,
                'error': f'Error reading CSV: {str(e)}'
            }

    @staticmethod
    def check_batch_performance(
        processing_times: List[float],
        image_count: int,
        *,
        excellent_time_per_image: float = 2.0,
        target_time_per_image: float = 5.0,
    ) -> Dict[str, Any]:
        """Analyze batch processing performance.

        Args:
            processing_times: Measured processing times in seconds. The
                average is taken over the entries of this list.
            image_count: Number of images processed; reported back unchanged
                as ``'total_images'``.
            excellent_time_per_image: Average time (seconds) at or below
                which the batch is rated ``'excellent'``.
            target_time_per_image: Average time (seconds) at or below which
                the batch is rated ``'good'``; anything slower is rated
                ``'needs_improvement'``.

        Returns:
            A dict with totals, the average time, the rating and the
            estimated minutes needed for 500 and 1000 images, or
            ``{'error': ...}`` when *processing_times* is empty.
        """
        if not processing_times:
            return {'error': 'No processing times provided'}

        total_time = sum(processing_times)
        avg_time = total_time / len(processing_times)

        if avg_time <= excellent_time_per_image:
            performance_rating = 'excellent'
        elif avg_time <= target_time_per_image:
            performance_rating = 'good'
        else:
            performance_rating = 'needs_improvement'

        return {
            'total_images': image_count,
            'total_time_seconds': round(total_time, 2),
            'average_time_per_image': round(avg_time, 2),
            'performance_rating': performance_rating,
            'estimated_time_for_500': round(avg_time * 500 / 60, 1),  # minutes
            'estimated_time_for_1000': round(avg_time * 1000 / 60, 1)  # minutes
        }

def validate_image_file(file_path: str) -> bool:
    """Quick validation that file is a valid image.

    Opens *file_path* with Pillow and runs ``Image.verify()`` on it.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens and verifies without error; False on any
        failure, including a missing file or Pillow not being importable.
    """
    try:
        # Imported lazily so the module can be loaded without Pillow.
        from PIL import Image
        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Best-effort check: any ordinary failure means "not a valid image".
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return False
Complete Enhanced Agricultural AI System - All Requirements Met 2025-07-16 20:35:20 +01:00			`"""`
			`Validation utilities for agricultural keyword tagging system`
			`"""`

			`import re`
			`from typing import List, Dict, Tuple`
			`import pandas as pd`

			`class KeywordValidator:`
			`"""Validates and scores keyword quality for agricultural photos"""`

			`def __init__(self):`
			`self.agricultural_terms = {`
			`'high_value': [`
			`'farmer', 'rancher', 'dairy farmer', 'chicken farmer',`
			`'tractor', 'combine', 'harvester', 'cattle', 'livestock',`
			`'corn', 'wheat', 'soybean', 'cotton', 'rice'`
			`],`
			`'medium_value': [`
			`'field', 'farm', 'barn', 'agriculture', 'farming',`
			`'rural', 'crop', 'harvest', 'planting', 'irrigation'`
			`],`
			`'low_value': [`
			`'outdoor', 'green', 'sunny', 'large', 'small', 'old', 'new'`
			`]`
			`}`

			`def validate_keywords(self, keywords: List[str]) -> Dict[str, any]:`
			`"""Validate keyword quality and relevance"""`
			`if not keywords:`
			`return {'score': 0, 'issues': ['No keywords provided']}`

			`issues = []`
			`score = 0`

			`# Check keyword count`
			`if len(keywords) < 5:`
			`issues.append(f'Only {len(keywords)} keywords (minimum 5 recommended)')`
			`elif len(keywords) > 10:`
			`issues.append(f'{len(keywords)} keywords (maximum 10 recommended)')`

			`# Score keywords based on agricultural relevance`
			`for keyword in keywords:`
			`if keyword in self.agricultural_terms['high_value']:`
			`score += 3`
			`elif keyword in self.agricultural_terms['medium_value']:`
			`score += 2`
			`elif keyword in self.agricultural_terms['low_value']:`
			`score += 1`
			`else:`
			`score += 0.5 # Generic terms`

			`# Check for required agricultural content`
			`has_agricultural_term = any(`
			`keyword in self.agricultural_terms['high_value'] + self.agricultural_terms['medium_value']`
			`for keyword in keywords`
			`)`

			`if not has_agricultural_term:`
			`issues.append('No clear agricultural terms detected')`
			`score *= 0.5`

			`# Normalize score (0-100)`
			`max_possible_score = len(keywords) * 3`
			`normalized_score = min(100, (score / max_possible_score) * 100) if max_possible_score > 0 else 0`

			`return {`
			`'score': round(normalized_score, 1),`
			`'issues': issues,`
			`'keyword_count': len(keywords),`
			`'agricultural_relevance': has_agricultural_term`
			`}`

			`def validate_title(self, title: str) -> Dict[str, any]:`
			`"""Validate title quality for stock photos"""`
			`issues = []`
			`score = 100`

			`if not title:`
			`return {'score': 0, 'issues': ['No title provided']}`

			`# Check length`
			`if len(title) < 10:`
			`issues.append('Title too short (minimum 10 characters)')`
			`score -= 20`
			`elif len(title) > 100:`
			`issues.append('Title too long (maximum 100 characters)')`
			`score -= 10`

			`# Check for agricultural content`
			`agricultural_words = [`
			`'farm', 'agriculture', 'crop', 'livestock', 'rural',`
			`'farmer', 'rancher', 'tractor', 'field', 'barn'`
			`]`

			`has_ag_content = any(word in title.lower() for word in agricultural_words)`
			`if not has_ag_content:`
			`issues.append('Title lacks agricultural context')`
			`score -= 30`

			`# Check capitalization`
			`if not title[0].isupper():`
			`issues.append('Title should start with capital letter')`
			`score -= 5`

			`return {`
			`'score': max(0, score),`
			`'issues': issues,`
			`'length': len(title),`
			`'agricultural_content': has_ag_content`
			`}`

			`class DataQualityChecker:`
			`"""Check data quality for batch processing"""`

			`@staticmethod`
			`def validate_csv_output(csv_path: str) -> Dict[str, any]:`
			`"""Validate CSV output format and content"""`
			`try:`
			`df = pd.read_csv(csv_path)`

			`required_columns = ['filename', 'human_keywords', 'ai_keywords', 'ai_title', 'location']`
			`missing_columns = [col for col in required_columns if col not in df.columns]`

			`if missing_columns:`
			`return {`
			`'valid': False,`
			`'error': f'Missing required columns: {missing_columns}'`
			`}`

			`# Check for empty critical fields`
			`empty_ai_keywords = df['ai_keywords'].isna().sum()`
			`empty_ai_titles = df['ai_title'].isna().sum()`

			`return {`
			`'valid': True,`
			`'total_rows': len(df),`
			`'empty_ai_keywords': empty_ai_keywords,`
			`'empty_ai_titles': empty_ai_titles,`
			`'completion_rate': {`
			`'keywords': round((len(df) - empty_ai_keywords) / len(df) * 100, 1),`
			`'titles': round((len(df) - empty_ai_titles) / len(df) * 100, 1)`
			`}`
			`}`

			`except Exception as e:`
			`return {`
			`'valid': False,`
			`'error': f'Error reading CSV: {str(e)}'`
			`}`

			`@staticmethod`
			`def check_batch_performance(processing_times: List[float], image_count: int) -> Dict[str, any]:`
			`"""Analyze batch processing performance"""`
			`if not processing_times:`
			`return {'error': 'No processing times provided'}`

			`avg_time = sum(processing_times) / len(processing_times)`
			`total_time = sum(processing_times)`

			`# Performance thresholds`
			`target_time_per_image = 5.0 # seconds`
			`performance_rating = 'excellent' if avg_time <= 2 else 'good' if avg_time <= 5 else 'needs_improvement'`

			`return {`
			`'total_images': image_count,`
			`'total_time_seconds': round(total_time, 2),`
			`'average_time_per_image': round(avg_time, 2),`
			`'performance_rating': performance_rating,`
			`'estimated_time_for_500': round(avg_time * 500 / 60, 1), # minutes`
			`'estimated_time_for_1000': round(avg_time * 1000 / 60, 1) # minutes`
			`}`

			`def validate_image_file(file_path: str) -> bool:`
			`"""Quick validation that file is a valid image"""`
			`try:`
			`from PIL import Image`
			`with Image.open(file_path) as img:`
			`img.verify()`
			`return True`
			`except:`
			`return False`