"""Named-entity and simple relationship extraction built on spaCy."""

import logging
from typing import List, Dict, Any, Optional

import spacy
from spacy import displacy

from ..models.document import Entity, EntityType
from config.settings import settings


class EntityExtractor:
    """Entity extraction service for named entity recognition.

    Wraps a spaCy pipeline (named by ``settings.SPACY_MODEL``) and converts
    its entities into the project's ``Entity`` model. The public extraction
    methods are best-effort: any failure is logged with its traceback and an
    empty result is returned instead of propagating the exception.
    """

    # spaCy entity label -> project EntityType. Labels that are not listed
    # here are dropped by extract_entities().
    _LABEL_MAP = {
        # Person
        'PERSON': EntityType.PERSON,
        # Places
        'GPE': EntityType.PLACE,  # Geopolitical entity
        'LOC': EntityType.PLACE,  # Location
        'FAC': EntityType.BUILDING,  # Facility/Building
        # Organizations
        'ORG': EntityType.ORGANIZATION,
        # Events
        'EVENT': EntityType.EVENT,
        # Dates
        'DATE': EntityType.DATE,
        'TIME': EntityType.DATE,
    }

    # spaCy labels that receive a confidence bonus in _calculate_confidence().
    _HIGH_CONFIDENCE_LABELS = frozenset({'PERSON', 'GPE', 'ORG'})

    # Dependency labels inspected by extract_relationships():
    # subject, direct object, prepositional object.
    _RELATION_DEPS = frozenset({'nsubj', 'dobj', 'pobj'})

    # Fixed heuristic score attached to every extracted relationship.
    _RELATIONSHIP_CONFIDENCE = 0.7

    def __init__(self):
        """Initialize entity extractor and eagerly load the spaCy model.

        Raises:
            OSError: If the configured spaCy model cannot be loaded.
        """
        self.logger = logging.getLogger(__name__)
        self.nlp = None
        self._load_model()

    def _load_model(self):
        """Load the spaCy NLP model named by ``settings.SPACY_MODEL``.

        Raises:
            OSError: Re-raised after logging when the model is not found.
        """
        model_name = settings.SPACY_MODEL
        try:
            self.nlp = spacy.load(model_name)
            self.logger.info("Loaded spaCy model: %s", model_name)
        except OSError:
            self.logger.error(
                "spaCy model %s not found. Please install it with: "
                "python -m spacy download %s",
                model_name,
                model_name,
            )
            raise

    def extract_entities(self, text: str) -> List[Entity]:
        """
        Extract named entities from text.

        Only entities whose spaCy label maps to an ``EntityType`` are
        returned; all others are skipped.

        Args:
            text: Input text to process

        Returns:
            List of extracted entities, or an empty list if processing fails
        """
        if self.nlp is None:
            self._load_model()

        try:
            # Process text with spaCy
            doc = self.nlp(text)

            entities = []
            for ent in doc.ents:
                # Map spaCy labels to our EntityType enum
                entity_type = self._map_spacy_label(ent.label_)
                if not entity_type:
                    # Only include mapped entity types
                    continue

                entities.append(
                    Entity(
                        text=ent.text,
                        label=ent.label_,
                        entity_type=entity_type,
                        start_pos=ent.start_char,
                        end_pos=ent.end_char,
                        confidence=self._calculate_confidence(ent),
                        metadata={
                            "spacy_label": ent.label_,
                            "spacy_explanation": spacy.explain(ent.label_),
                        },
                    )
                )

            self.logger.debug("Extracted %d entities from text", len(entities))
            return entities

        except Exception as e:
            # Best-effort boundary: keep the traceback in the log, return empty.
            self.logger.exception("Error extracting entities: %s", e)
            return []

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract relationships between entities.

        Uses the dependency parse: for every token whose dependency label is
        a subject, direct object or prepositional object, a relationship is
        emitted when both the token and its syntactic head lie inside two
        different named entities.

        Args:
            text: Input text to process

        Returns:
            List of relationship dictionaries with the keys ``subject``,
            ``subject_type``, ``predicate``, ``object``, ``object_type``,
            ``relation_type`` and ``confidence``; an empty list if
            processing fails
        """
        if self.nlp is None:
            self._load_model()

        try:
            doc = self.nlp(text)
            doc_ents = doc.ents

            relationships = []
            # Simple relationship extraction based on dependency parsing
            for token in doc:
                if token.dep_ not in self._RELATION_DEPS:
                    continue

                head = token.head

                # Check if both token and head are part of named entities
                token_ent = self._get_entity_for_token(token, doc_ents)
                head_ent = self._get_entity_for_token(head, doc_ents)

                if token_ent and head_ent and token_ent != head_ent:
                    relationships.append({
                        # Entity containing the dependent token.
                        "subject": token_ent.text,
                        "subject_type": self._map_spacy_label(token_ent.label_),
                        # Text of the head token itself.
                        "predicate": head.text,
                        # Entity containing the head token.
                        "object": head_ent.text,
                        "object_type": self._map_spacy_label(head_ent.label_),
                        "relation_type": token.dep_,
                        "confidence": self._RELATIONSHIP_CONFIDENCE,
                    })

            self.logger.debug(
                "Extracted %d relationships from text", len(relationships)
            )
            return relationships

        except Exception as e:
            # Best-effort boundary: keep the traceback in the log, return empty.
            self.logger.exception("Error extracting relationships: %s", e)
            return []

    def _map_spacy_label(self, spacy_label: str) -> Optional[EntityType]:
        """
        Map spaCy entity labels to our EntityType enum.

        Args:
            spacy_label: spaCy entity label

        Returns:
            Corresponding EntityType or None if not mapped
        """
        return self._LABEL_MAP.get(spacy_label)

    def _calculate_confidence(self, entity) -> float:
        """
        Calculate confidence score for an entity.

        The score is a heuristic: a base of 0.5, plus 0.1 when the entity
        text is longer than three characters, plus 0.1 when the text is
        title-cased, plus 0.2 for PERSON, GPE and ORG labels, capped at 1.0.

        Args:
            entity: spaCy entity object (its ``text`` and ``label_`` are read)

        Returns:
            Confidence score between 0 and 1
        """
        confidence = 0.5  # Base confidence

        # Increase confidence for longer entities
        if len(entity.text) > 3:
            confidence += 0.1

        # Increase confidence for capitalized entities
        if entity.text.istitle():
            confidence += 0.1

        # Increase confidence for certain entity types
        if entity.label_ in self._HIGH_CONFIDENCE_LABELS:
            confidence += 0.2

        return min(confidence, 1.0)

    def _get_entity_for_token(self, token, entities):
        """
        Get the entity that contains a specific token.

        Args:
            token: spaCy token (its index ``i`` is compared to entity bounds)
            entities: Iterable of spaCy entities exposing ``start`` / ``end``

        Returns:
            First entity whose ``[start, end)`` range contains the token,
            or None
        """
        for ent in entities:
            if ent.start <= token.i < ent.end:
                return ent
        return None

    def get_entity_summary(self, entities: List[Entity]) -> Dict[str, int]:
        """
        Get summary statistics for extracted entities.

        Args:
            entities: List of entities

        Returns:
            Dictionary mapping each entity type value to its occurrence count
        """
        summary: Dict[str, int] = {}
        for entity in entities:
            entity_type = entity.entity_type.value
            summary[entity_type] = summary.get(entity_type, 0) + 1
        return summary

    def visualize_entities(self, text: str, output_path: Optional[str] = None) -> str:
        """
        Create HTML visualization of entities in text.

        Args:
            text: Input text
            output_path: Optional file path to save HTML (written as UTF-8)

        Returns:
            HTML string with entity visualization, or an empty string if
            rendering or saving fails
        """
        if self.nlp is None:
            self._load_model()

        try:
            doc = self.nlp(text)

            # Generate HTML visualization
            html = displacy.render(doc, style="ent", jupyter=False)

            if output_path:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(html)
                self.logger.info("Entity visualization saved to: %s", output_path)

            return html

        except Exception as e:
            # Best-effort boundary: keep the traceback in the log, return "".
            self.logger.exception("Error creating entity visualization: %s", e)
            return ""