# ds-smart-farm-project/src/model/keyword_generator.py

"""
Agricultural photo keyword generator using the BLIP image-captioning model
"""

import os
import re
from typing import Any, Dict, List, Optional

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class AgricultureKeywordGenerator:
    """Caption agricultural photos with BLIP and derive keywords and a title.

    The caption produced by the model is scanned for terms from a nested
    agricultural vocabulary (``self.agriculture_keywords``) and for a small
    set of descriptive adjectives; the hits are refined, de-duplicated and
    ordered so that role keywords (farmer, rancher, ...) come first.
    """

    # Hugging Face checkpoint used when no fine-tuned model is available.
    _DEFAULT_MODEL_NAME = "Salesforce/blip-image-captioning-base"

    # Descriptive adjectives picked up from captions in addition to the
    # agricultural vocabulary. Compiled once instead of on every call.
    _DESCRIPTIVE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\b(?:green|fresh|organic|natural|healthy|ripe|mature)\b',  # Quality
            r'\b(?:rural|outdoor|countryside|pastoral|agricultural)\b',   # Setting
            r'\b(?:sunny|cloudy|dawn|dusk|morning|evening)\b',           # Time/Weather
            r'\b(?:large|small|big|little|huge|tiny|vast|wide)\b',       # Size
            r'\b(?:young|old|new|vintage|modern|traditional)\b',         # Age/Style
            r'\b(?:male|female|man|woman|boy|girl)\b',                   # Gender
        )
    )

    def __init__(self, model_path: Optional[str] = None):
        """
        Initialize the BLIP model for image captioning and keyword generation

        Args:
            model_path: Path to fine-tuned model. If None (or if the path does
                not exist), the pre-trained base checkpoint is used instead.
        """
        if model_path and os.path.exists(model_path):
            print(f"Loading fine-tuned agricultural model from: {model_path}")
            self.processor = BlipProcessor.from_pretrained(model_path)
            self.model = BlipForConditionalGeneration.from_pretrained(model_path)
            self.is_fine_tuned = True
        else:
            print("Loading pre-trained BLIP model for keyword generation...")
            self.processor = BlipProcessor.from_pretrained(self._DEFAULT_MODEL_NAME)
            self.model = BlipForConditionalGeneration.from_pretrained(self._DEFAULT_MODEL_NAME)
            self.is_fine_tuned = False
            if model_path:
                print(f"Warning: Fine-tuned model not found at {model_path}, using pre-trained model")

        # Enhanced agriculture-specific keywords with distinctions.
        # Layout: main category -> subcategory -> list of lowercase terms.
        self.agriculture_keywords = {
            'people': {
                'farmer': ['farmer', 'crop farmer', 'grain farmer', 'vegetable farmer'],
                'rancher': ['rancher', 'cattle rancher', 'livestock rancher', 'beef rancher'],
                'dairy': ['dairy farmer', 'dairy worker', 'milker'],
                'poultry': ['chicken farmer', 'poultry farmer', 'egg farmer'],
                'worker': ['farm worker', 'agricultural worker', 'field worker', 'ranch hand'],
                'gender': ['male farmer', 'female farmer', 'man', 'woman', 'boy', 'girl']
            },
            'animals': {
                'cattle': ['cow', 'cattle', 'bull', 'calf', 'beef cattle', 'dairy cow', 'holstein', 'angus'],
                'poultry': ['chicken', 'rooster', 'hen', 'chick', 'turkey', 'duck', 'goose'],
                'swine': ['pig', 'hog', 'swine', 'piglet', 'boar', 'sow'],
                'sheep': ['sheep', 'lamb', 'ewe', 'ram', 'wool'],
                'goats': ['goat', 'kid', 'billy goat', 'nanny goat'],
                'horses': ['horse', 'mare', 'stallion', 'foal', 'pony']
            },
            'crops': {
                'grains': ['corn', 'wheat', 'rice', 'barley', 'oats', 'rye', 'sorghum'],
                'legumes': ['soybean', 'beans', 'peas', 'lentils', 'peanuts'],
                'vegetables': ['tomato', 'potato', 'carrot', 'onion', 'pepper', 'lettuce', 'cabbage'],
                'fruits': ['apple', 'orange', 'grape', 'strawberry', 'peach', 'cherry'],
                'cash_crops': ['cotton', 'tobacco', 'sugar beet', 'sunflower']
            },
            'equipment': {
                'tractors': ['tractor', 'farm tractor', 'john deere', 'case ih', 'new holland'],
                'harvest': ['combine', 'harvester', 'thresher', 'picker'],
                'tillage': ['plow', 'disc', 'cultivator', 'harrow', 'chisel plow'],
                'planting': ['planter', 'seeder', 'drill', 'transplanter'],
                'irrigation': ['sprinkler', 'pivot', 'irrigation', 'drip system'],
                'livestock': ['milking machine', 'feeder', 'water tank', 'barn equipment']
            },
            'locations': {
                'fields': ['field', 'cropland', 'farmland', 'pasture', 'meadow'],
                'buildings': ['barn', 'silo', 'grain bin', 'shed', 'farmhouse', 'greenhouse'],
                'areas': ['farm', 'ranch', 'dairy', 'feedlot', 'orchard', 'vineyard']
            },
            'activities': {
                'crop': ['planting', 'seeding', 'harvesting', 'cultivation', 'irrigation'],
                'livestock': ['feeding', 'milking', 'herding', 'breeding', 'grazing'],
                'general': ['farming', 'agriculture', 'rural work', 'field work']
            }
        }

        print("Model loaded successfully!")

    @staticmethod
    def _mentions(text: str, term: str) -> bool:
        """Return True if *term* occurs in *text* as a whole word or phrase.

        A trailing plural ``s``/``es`` is tolerated so that "cows" still counts
        as "cow", but the term must start and end on word boundaries. This
        prevents hits such as 'oats' in "goats", 'ram' in "frame" or 'man' in
        "woman" that a plain substring test produces.
        """
        return re.search(rf"\b{re.escape(term)}(?:e?s)?\b", text) is not None

    def generate_caption(self, image_path: str) -> str:
        """Generate a descriptive caption for the image

        Args:
            image_path: Path of the image file to caption.

        Returns:
            The decoded caption, or an empty string if anything fails (the
            error is printed rather than raised).
        """
        try:
            # Close the file handle as soon as the RGB copy has been made.
            with Image.open(image_path) as source:
                image = source.convert('RGB')
            inputs = self.processor(image, return_tensors="pt")

            with torch.no_grad():
                out = self.model.generate(**inputs, max_length=50, num_beams=5)

            return self.processor.decode(out[0], skip_special_tokens=True)
        except Exception as e:
            # Best effort by design: callers receive "" instead of an exception.
            print(f"Error generating caption for {image_path}: {e}")
            return ""

    def extract_keywords_from_caption(self, caption: str) -> List[str]:
        """Extract agriculture-relevant keywords from caption with enhanced distinctions

        Args:
            caption: Free-text caption; matching is case-insensitive.

        Returns:
            At most 10 unique keywords, role keywords first.
        """
        if not caption:
            return []

        caption_lower = caption.lower()
        keywords: List[str] = []

        # Vocabulary hits, in vocabulary order.
        for subcategories in self.agriculture_keywords.values():
            if isinstance(subcategories, dict):
                term_groups = subcategories.values()
            else:
                # Handle old flat format (category -> list of terms)
                term_groups = [subcategories]
            for terms in term_groups:
                keywords.extend(
                    term for term in terms if self._mentions(caption_lower, term)
                )

        # Descriptive adjectives with agricultural context
        for pattern in self._DESCRIPTIVE_PATTERNS:
            keywords.extend(pattern.findall(caption_lower))

        # Apply agricultural distinctions
        keywords = self._apply_agricultural_distinctions(keywords, caption_lower)

        # Remove duplicates and prioritize agricultural terms
        keywords = self._prioritize_keywords(keywords)

        return keywords[:10]  # Limit to 10 keywords max

    def _apply_agricultural_distinctions(self, keywords: List[str], caption: str) -> List[str]:
        """Apply specific agricultural distinctions (farmer vs rancher, etc.)

        Args:
            keywords: Keywords found so far; the list is not modified.
            caption: Caption text used to look for context words.

        Returns:
            A new keyword list with role keywords swapped or added. It may
            contain duplicates; ``_prioritize_keywords`` removes them.
        """
        enhanced_keywords = keywords.copy()
        text = caption.lower()

        def mentions_any(terms) -> bool:
            # Whole-word test, so 'ranch' does not fire on "branch", 'hen' on
            # "when", or 'male' on "female".
            return any(self._mentions(text, term) for term in terms)

        def swap(old: str, new: str) -> None:
            # Replace the first occurrence of *old* (if any) by appending *new*.
            if old in enhanced_keywords:
                enhanced_keywords.remove(old)
                enhanced_keywords.append(new)

        # Farmer vs Rancher distinction
        if mentions_any(['cattle', 'cow', 'beef', 'livestock', 'ranch']):
            swap('farmer', 'rancher')
        elif mentions_any(['crop', 'grain', 'corn', 'wheat', 'field']):
            swap('rancher', 'farmer')

        # Dairy farmer distinction ('milking'/'milker' are listed explicitly
        # because matching is whole-word, not by stem)
        if mentions_any(['milk', 'milking', 'milker', 'dairy', 'holstein']):
            swap('farmer', 'dairy farmer')
            swap('rancher', 'dairy farmer')

        # Chicken farmer (not rancher)
        if mentions_any(['chicken', 'poultry', 'hen', 'rooster']):
            swap('rancher', 'chicken farmer')

        # Gender identification enhancement
        gender_indicators = {
            'male': ['man', 'boy', 'male', 'father', 'son', 'husband'],
            'female': ['woman', 'girl', 'female', 'mother', 'daughter', 'wife']
        }
        roles = ('farmer', 'rancher', 'dairy farmer')

        for gender, indicators in gender_indicators.items():
            if mentions_any(indicators) and any(role in enhanced_keywords for role in roles):
                # Add gender specification
                enhanced_keywords.append(f'{gender} farmer')

        return enhanced_keywords

    def _prioritize_keywords(self, keywords: List[str]) -> List[str]:
        """Prioritize agricultural keywords over generic ones

        Returns the unique keywords ordered by tier: keywords containing a
        high-priority role first, then those containing a medium-priority
        term, then everything else. Within a tier the order of first
        appearance is kept.
        """
        high_priority = ('farmer', 'rancher', 'dairy farmer', 'chicken farmer')
        medium_priority = ('tractor', 'cattle', 'corn', 'wheat', 'barn', 'field')

        def tier(keyword: str) -> int:
            # Substring test on purpose: 'male farmer' ranks with 'farmer'.
            if any(term in keyword for term in high_priority):
                return 0
            if any(term in keyword for term in medium_priority):
                return 1
            return 2

        # dict.fromkeys de-duplicates while preserving order; sorted() is
        # stable, so first-appearance order survives inside each tier.
        return sorted(dict.fromkeys(keywords), key=tier)

    def generate_keywords(self, image_path: str) -> Dict[str, Any]:
        """Generate keywords and title for an agricultural image

        Args:
            image_path: Path of the image file to describe.

        Returns:
            Dict with 'caption' (str), 'keywords' (list of at most 10 str,
            padded with generic terms up to 5 entries) and 'title' (str).
        """
        caption = self.generate_caption(image_path)
        keywords = self.extract_keywords_from_caption(caption)

        # If we don't have enough keywords, add some generic agricultural terms
        if len(keywords) < 5:
            generic_terms = ['agriculture', 'farming', 'rural', 'outdoor', 'field']
            for term in generic_terms:
                if term not in keywords:
                    keywords.append(term)
                if len(keywords) >= 5:
                    break

        return {
            'caption': caption,
            'keywords': keywords[:10],  # Limit to 10 keywords max
            'title': self.generate_title(caption)
        }

    def generate_title(self, caption: str) -> str:
        """Generate a product title from the caption

        The caption is stripped and its first character upper-cased. If it
        contains none of a few agriculture stems it is prefixed with
        "Agricultural scene: ". An empty or blank caption yields the bare
        fallback "Agricultural scene" (no dangling colon).
        """
        title = caption.strip() if caption else ""
        if not title:
            return "Agricultural scene"

        if not title[0].isupper():
            title = title[0].upper() + title[1:]

        # Add "Agricultural scene" prefix if not agriculture-related.
        # Substring test on purpose so stems cover 'farmer', 'farming', 'crops'.
        agriculture_terms = ['farm', 'agriculture', 'crop', 'livestock', 'rural']
        lowered = title.lower()
        if not any(term in lowered for term in agriculture_terms):
            title = f"Agricultural scene: {title}"

        return title