# src/services/entity_extractor.py

import logging
from collections import Counter
from typing import List, Dict, Any, Optional

import spacy
from spacy import displacy

from ..models.document import Entity, EntityType
from config.settings import settings


class EntityExtractor:
    """Entity extraction service for named entity recognition.

    Wraps the spaCy pipeline named by ``settings.SPACY_MODEL``. The pipeline
    is loaded when the extractor is constructed; a missing model raises
    ``OSError`` from the constructor. The public extraction and visualization
    methods log processing failures (with traceback) and return an empty
    result instead of raising.
    """

    # spaCy labels that earn a confidence bonus in ``_calculate_confidence``.
    _HIGH_CONFIDENCE_LABELS = frozenset({'PERSON', 'GPE', 'ORG'})

    # Dependency labels inspected by ``extract_relationships``:
    # subject, direct object, prepositional object.
    _RELATION_DEPS = frozenset({'nsubj', 'dobj', 'pobj'})

    # Fixed score attached to every extracted relationship.
    _RELATION_CONFIDENCE = 0.7

    def __init__(self):
        """Initialize entity extractor and load the spaCy model.

        Raises:
            OSError: If the configured spaCy model is not installed.
        """
        self.logger = logging.getLogger(__name__)
        self.nlp = None
        self._load_model()

    def _load_model(self):
        """Load the spaCy NLP model named by ``settings.SPACY_MODEL``.

        Raises:
            OSError: Re-raised from ``spacy.load`` after logging an
                installation hint.
        """
        try:
            self.nlp = spacy.load(settings.SPACY_MODEL)
        except OSError:
            self.logger.error(
                "spaCy model %s not found. Please install it with: "
                "python -m spacy download %s",
                settings.SPACY_MODEL,
                settings.SPACY_MODEL,
            )
            raise
        else:
            self.logger.info("Loaded spaCy model: %s", settings.SPACY_MODEL)

    def _ensure_model(self) -> None:
        """Load the model if ``self.nlp`` is still unset."""
        if self.nlp is None:
            self._load_model()

    def extract_entities(self, text: str) -> List[Entity]:
        """
        Extract named entities from text.

        Only entities whose spaCy label is known to ``_map_spacy_label`` are
        returned; all others are dropped.

        Args:
            text: Input text to process

        Returns:
            List of extracted entities. Empty when the text is blank or when
            processing fails (the failure is logged with its traceback).
        """
        self._ensure_model()

        # Blank input cannot contain entities; skip the pipeline entirely.
        if isinstance(text, str) and not text.strip():
            return []

        try:
            doc = self.nlp(text)

            entities = []
            for ent in doc.ents:
                entity_type = self._map_spacy_label(ent.label_)
                if not entity_type:
                    # Label has no counterpart in our EntityType enum.
                    continue

                entities.append(
                    Entity(
                        text=ent.text,
                        label=ent.label_,
                        entity_type=entity_type,
                        start_pos=ent.start_char,
                        end_pos=ent.end_char,
                        confidence=self._calculate_confidence(ent),
                        metadata={
                            "spacy_label": ent.label_,
                            "spacy_explanation": spacy.explain(ent.label_),
                        },
                    )
                )

            self.logger.debug("Extracted %d entities from text", len(entities))
            return entities

        except Exception as e:
            # Best-effort API: keep the traceback in the log, return nothing.
            self.logger.exception("Error extracting entities: %s", e)
            return []

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract relationships between entities.

        A relationship is emitted for every token whose dependency label is
        ``nsubj``, ``dobj`` or ``pobj`` when both the token and its head lie
        inside two different named entities.

        Args:
            text: Input text to process

        Returns:
            List of relationship dicts with the keys ``subject``,
            ``subject_type``, ``predicate``, ``object``, ``object_type``,
            ``relation_type`` and ``confidence``. Empty when the text is
            blank or when processing fails (the failure is logged).
        """
        self._ensure_model()

        # Blank input cannot contain relationships; skip the pipeline.
        if isinstance(text, str) and not text.strip():
            return []

        try:
            doc = self.nlp(text)

            # Index entities once so each token lookup is O(1) instead of a
            # scan over every entity.
            entity_at = self._index_entities_by_token(doc.ents)

            relationships = []
            for token in doc:
                if token.dep_ not in self._RELATION_DEPS:
                    continue

                head = token.head
                token_ent = entity_at.get(token.i)
                head_ent = entity_at.get(head.i)
                if token_ent is None or head_ent is None:
                    continue
                if not (token_ent != head_ent):
                    # Token and head belong to the same entity.
                    continue

                # NOTE(review): the dependent's entity is reported as
                # "subject" even for dobj/pobj dependents; confirm with
                # consumers before changing this heuristic.
                relationships.append({
                    "subject": token_ent.text,
                    "subject_type": self._map_spacy_label(token_ent.label_),
                    "predicate": head.text,
                    "object": head_ent.text,
                    "object_type": self._map_spacy_label(head_ent.label_),
                    "relation_type": token.dep_,
                    "confidence": self._RELATION_CONFIDENCE,
                })

            self.logger.debug(
                "Extracted %d relationships from text", len(relationships)
            )
            return relationships

        except Exception as e:
            # Best-effort API: keep the traceback in the log, return nothing.
            self.logger.exception("Error extracting relationships: %s", e)
            return []

    def _map_spacy_label(self, spacy_label: str) -> Optional[EntityType]:
        """
        Map spaCy entity labels to our EntityType enum.

        Args:
            spacy_label: spaCy entity label

        Returns:
            Corresponding EntityType or None if not mapped
        """
        mapping = {
            # Person
            'PERSON': EntityType.PERSON,

            # Places
            'GPE': EntityType.PLACE,  # Geopolitical entity
            'LOC': EntityType.PLACE,  # Location
            'FAC': EntityType.BUILDING,  # Facility/Building

            # Organizations
            'ORG': EntityType.ORGANIZATION,

            # Events
            'EVENT': EntityType.EVENT,

            # Dates
            'DATE': EntityType.DATE,
            'TIME': EntityType.DATE,
        }

        return mapping.get(spacy_label)

    def _calculate_confidence(self, entity: Any) -> float:
        """
        Calculate a heuristic confidence score for an entity.

        Starts from 0.5 and adds 0.1 for text longer than three characters,
        0.1 for title-cased text and 0.2 for PERSON/GPE/ORG labels.

        Args:
            entity: Object exposing ``text`` and ``label_`` (a spaCy entity)

        Returns:
            Confidence score, capped at 1.0
        """
        confidence = 0.5  # Base confidence

        if len(entity.text) > 3:
            confidence += 0.1

        if entity.text.istitle():
            confidence += 0.1

        if entity.label_ in self._HIGH_CONFIDENCE_LABELS:
            confidence += 0.2

        return min(confidence, 1.0)

    @staticmethod
    def _index_entities_by_token(entities: Any) -> Dict[int, Any]:
        """
        Build a token-index -> entity lookup table.

        Args:
            entities: Iterable of objects exposing ``start`` and ``end``
                token offsets (``end`` exclusive)

        Returns:
            Dict mapping every covered token index to its entity. If two
            entities cover the same index the first one wins, matching
            ``_get_entity_for_token``.
        """
        index: Dict[int, Any] = {}
        for ent in entities:
            for position in range(ent.start, ent.end):
                index.setdefault(position, ent)
        return index

    def _get_entity_for_token(self, token: Any, entities: Any) -> Optional[Any]:
        """
        Get the entity that contains a specific token.

        Args:
            token: Object exposing a token index ``i`` (a spaCy token)
            entities: Iterable of objects exposing ``start``/``end`` token
                offsets (``end`` exclusive)

        Returns:
            First entity containing the token or None
        """
        return next(
            (ent for ent in entities if ent.start <= token.i < ent.end),
            None,
        )

    def get_entity_summary(self, entities: List[Entity]) -> Dict[str, int]:
        """
        Get summary statistics for extracted entities.

        Args:
            entities: List of entities

        Returns:
            Dictionary mapping each ``entity_type.value`` to its count, in
            order of first appearance
        """
        return dict(Counter(entity.entity_type.value for entity in entities))

    def visualize_entities(self, text: str, output_path: Optional[str] = None) -> str:
        """
        Create HTML visualization of entities in text.

        Args:
            text: Input text
            output_path: Optional file path to save HTML (written as UTF-8)

        Returns:
            HTML string with entity visualization, or an empty string when
            rendering or saving fails (the failure is logged)
        """
        self._ensure_model()

        try:
            doc = self.nlp(text)

            # jupyter=False forces displacy to return the markup as a string.
            html = displacy.render(doc, style="ent", jupyter=False)

            if output_path:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(html)
                self.logger.info(
                    "Entity visualization saved to: %s", output_path
                )

            return html

        except Exception as e:
            # Best-effort API: keep the traceback in the log, return nothing.
            self.logger.exception(
                "Error creating entity visualization: %s", e
            )
            return ""
Initial commit 2025-08-04 14:50:33 +01:00			`import logging`
			`from typing import List, Dict, Any, Optional`
			`import spacy`
			`from spacy import displacy`

			`from ..models.document import Entity, EntityType`
			`from config.settings import settings`


			`class EntityExtractor:`
			`"""Entity extraction service for named entity recognition."""`

			`def __init__(self):`
			`"""Initialize entity extractor."""`
			`self.logger = logging.getLogger(__name__)`
			`self.nlp = None`
			`self._load_model()`

			`def _load_model(self):`
			`"""Load the spaCy NLP model."""`
			`try:`
			`self.nlp = spacy.load(settings.SPACY_MODEL)`
			`self.logger.info(f"Loaded spaCy model: {settings.SPACY_MODEL}")`
			`except OSError:`
			`self.logger.error(f"spaCy model {settings.SPACY_MODEL} not found. Please install it with: python -m spacy download {settings.SPACY_MODEL}")`
			`raise`

			`def extract_entities(self, text: str) -> List[Entity]:`
			`"""`
			`Extract named entities from text.`

			`Args:`
			`text: Input text to process`

			`Returns:`
			`List of extracted entities`
			`"""`
			`if not self.nlp:`
			`self._load_model()`

			`try:`
			`# Process text with spaCy`
			`doc = self.nlp(text)`

			`entities = []`
			`for ent in doc.ents:`
			`# Map spaCy labels to our EntityType enum`
			`entity_type = self._map_spacy_label(ent.label_)`

			`if entity_type: # Only include mapped entity types`
			`entity = Entity(`
			`text=ent.text,`
			`label=ent.label_,`
			`entity_type=entity_type,`
			`start_pos=ent.start_char,`
			`end_pos=ent.end_char,`
			`confidence=self._calculate_confidence(ent),`
			`metadata={`
			`"spacy_label": ent.label_,`
			`"spacy_explanation": spacy.explain(ent.label_)`
			`}`
			`)`
			`entities.append(entity)`

			`self.logger.debug(f"Extracted {len(entities)} entities from text")`
			`return entities`

			`except Exception as e:`
			`self.logger.error(f"Error extracting entities: {str(e)}")`
			`return []`

			`def extract_relationships(self, text: str) -> List[Dict[str, Any]]:`
			`"""`
			`Extract relationships between entities.`

			`Args:`
			`text: Input text to process`

			`Returns:`
			`List of relationships between entities`
			`"""`
			`if not self.nlp:`
			`self._load_model()`

			`try:`
			`doc = self.nlp(text)`
			`relationships = []`

			`# Simple relationship extraction based on dependency parsing`
			`for token in doc:`
			`if token.dep_ in ['nsubj', 'dobj', 'pobj']: # Subject, direct object, prepositional object`
			`head = token.head`

			`# Check if both token and head are part of named entities`
			`token_ent = self._get_entity_for_token(token, doc.ents)`
			`head_ent = self._get_entity_for_token(head, doc.ents)`

			`if token_ent and head_ent and token_ent != head_ent:`
			`relationship = {`
			`"subject": token_ent.text,`
			`"subject_type": self._map_spacy_label(token_ent.label_),`
			`"predicate": head.text,`
			`"object": head_ent.text,`
			`"object_type": self._map_spacy_label(head_ent.label_),`
			`"relation_type": token.dep_,`
			`"confidence": 0.7 # Basic confidence score`
			`}`
			`relationships.append(relationship)`

			`self.logger.debug(f"Extracted {len(relationships)} relationships from text")`
			`return relationships`

			`except Exception as e:`
			`self.logger.error(f"Error extracting relationships: {str(e)}")`
			`return []`

			`def _map_spacy_label(self, spacy_label: str) -> Optional[EntityType]:`
			`"""`
			`Map spaCy entity labels to our EntityType enum.`

			`Args:`
			`spacy_label: spaCy entity label`

			`Returns:`
			`Corresponding EntityType or None if not mapped`
			`"""`
			`mapping = {`
			`# Person`
			`'PERSON': EntityType.PERSON,`

			`# Places`
			`'GPE': EntityType.PLACE, # Geopolitical entity`
			`'LOC': EntityType.PLACE, # Location`
			`'FAC': EntityType.BUILDING, # Facility/Building`

			`# Organizations`
			`'ORG': EntityType.ORGANIZATION,`

			`# Events`
			`'EVENT': EntityType.EVENT,`

			`# Dates`
			`'DATE': EntityType.DATE,`
			`'TIME': EntityType.DATE,`
			`}`

			`return mapping.get(spacy_label)`

			`def _calculate_confidence(self, entity) -> float:`
			`"""`
			`Calculate confidence score for an entity.`

			`Args:`
			`entity: spaCy entity object`

			`Returns:`
			`Confidence score between 0 and 1`
			`"""`
			`# Basic confidence calculation based on entity properties`
			`confidence = 0.5 # Base confidence`

			`# Increase confidence for longer entities`
			`if len(entity.text) > 3:`
			`confidence += 0.1`

			`# Increase confidence for capitalized entities`
			`if entity.text.istitle():`
			`confidence += 0.1`

			`# Increase confidence for certain entity types`
			`high_confidence_types = ['PERSON', 'GPE', 'ORG']`
			`if entity.label_ in high_confidence_types:`
			`confidence += 0.2`

			`return min(confidence, 1.0)`

			`def _get_entity_for_token(self, token, entities):`
			`"""`
			`Get the entity that contains a specific token.`

			`Args:`
			`token: spaCy token`
			`entities: List of spaCy entities`

			`Returns:`
			`Entity containing the token or None`
			`"""`
			`for ent in entities:`
			`if ent.start <= token.i < ent.end:`
			`return ent`
			`return None`

			`def get_entity_summary(self, entities: List[Entity]) -> Dict[str, int]:`
			`"""`
			`Get summary statistics for extracted entities.`

			`Args:`
			`entities: List of entities`

			`Returns:`
			`Dictionary with entity type counts`
			`"""`
			`summary = {}`
			`for entity in entities:`
			`entity_type = entity.entity_type.value`
			`summary[entity_type] = summary.get(entity_type, 0) + 1`

			`return summary`

			`def visualize_entities(self, text: str, output_path: Optional[str] = None) -> str:`
			`"""`
			`Create HTML visualization of entities in text.`

			`Args:`
			`text: Input text`
			`output_path: Optional file path to save HTML`

			`Returns:`
			`HTML string with entity visualization`
			`"""`
			`if not self.nlp:`
			`self._load_model()`

			`try:`
			`doc = self.nlp(text)`

			`# Generate HTML visualization`
			`html = displacy.render(doc, style="ent", jupyter=False)`

			`if output_path:`
			`with open(output_path, 'w', encoding='utf-8') as f:`
			`f.write(html)`
			`self.logger.info(f"Entity visualization saved to: {output_path}")`

			`return html`

			`except Exception as e:`
			`self.logger.error(f"Error creating entity visualization: {str(e)}")`
			`return ""`