# Standards matching logic
import re
from typing import Dict, List, Set, Tuple, Optional

from loguru import logger

from app.core.models import Standard, Requirement

# Markdown headings of level 1-3 ("# Title" .. "### Title").
_HEADING_PATTERN = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)

# Sentence boundary: terminal punctuation followed by whitespace.
_SENTENCE_BOUNDARY_PATTERN = re.compile(r'[.!?]\s+')

# All-caps tokens of two or more characters (e.g. "API", "ISO9001").
_ALL_CAPS_PATTERN = re.compile(r'\b[A-Z][A-Z0-9]+\b')

# Section names that are looked up by plain text (no markdown heading needed).
_COMMON_SECTIONS = (
    "introduction", "scope", "purpose", "references", "definitions",
    "requirements", "compliance", "standards", "conclusion", "summary",
    "appendix", "annex",
)

# Sections in which a mention of the standard name earns a bonus.
_IMPORTANT_SECTIONS = (
    "introduction", "scope", "purpose", "references", "standards",
    "compliance", "requirements",
)


class StandardsMatcher:
    """
    Advanced matching logic to identify relevant standards for documents.

    This class implements sophisticated matching algorithms beyond simple
    text matching. A standard's relevance score is the sum of five heuristic
    components: occurrence of the standard name in the document, overlap of
    the standard's keywords with the document text, mentions of the name in
    notable sections or section titles, overlap with key terms extracted from
    the document, and occurrence of requirement description words.

    NOTE(review): `Standard` is assumed to expose `name` (str), `description`
    (optional str) and `requirements` (iterable of objects with `id` and an
    optional `description`) -- inferred from attribute access in this file;
    confirm against app.core.models.
    """

    def __init__(self):
        """Initialize the standards matcher."""
        # Common stopwords to filter out when extracting keywords
        self.stopwords = {
            "the", "a", "an", "and", "or", "in", "on", "at", "to", "for",
            "with", "by", "of", "is", "are", "was", "were", "be", "been",
            "being", "have", "has", "had", "do", "does", "did", "but", "if",
            "then", "else", "when", "where", "why", "how", "all", "any",
            "both", "each", "few", "more", "most", "other", "some", "such",
            "no", "nor", "not", "only", "own", "same", "so", "than", "too",
            "very", "can", "will", "just", "should", "now", "this", "that",
            "these", "those"
        }

        # Technical terms that indicate compliance requirements
        self.technical_indicators = [
            "shall", "must", "required", "should", "recommended", "may",
            "optional", "compliant", "compliance", "conform", "standard",
            "specification", "requirement", "procedure", "process", "method",
            "test", "verify", "validate", "certification", "certified",
            "approved", "regulation", "regulatory", "guideline", "protocol"
        ]

        # Common standard prefixes and abbreviations
        self.standard_prefixes = [
            "iso", "ieee", "astm", "ansi", "iec", "din", "bs", "en", "jis",
            "gb", "api", "asme", "nfpa", "ul", "mil", "std", "rfc", "itu"
        ]

    def _bigrams(self, words: List[str]) -> List[str]:
        """Return lowercase two-word phrases in which neither word is a stopword."""
        lowered = [word.lower() for word in words]
        return [
            f"{first} {second}"
            for first, second in zip(lowered, lowered[1:])
            if first not in self.stopwords and second not in self.stopwords
        ]

    def extract_document_sections(self, document_content: str) -> Dict[str, str]:
        """
        Extract sections from a document to improve matching.

        The result always contains the key "full_document". Markdown headings
        (levels 1-3) contribute one entry each, keyed by the heading text.
        Common section names found at the start of a line contribute an entry
        keyed by the lowercase section name; these are stored last, so they
        overwrite a heading entry with an identical key.

        Args:
            document_content: The document content

        Returns:
            Dictionary of section name to section content
        """
        sections = {"full_document": document_content}

        # Try to identify document sections using markdown headings
        headings = list(_HEADING_PATTERN.finditer(document_content))
        for index, heading in enumerate(headings):
            section_name = heading.group(2).strip()
            # Section content runs from this heading to the next, or to the end
            start_pos = heading.end()
            if index + 1 < len(headings):
                end_pos = headings[index + 1].start()
            else:
                end_pos = len(document_content)
            sections[section_name] = document_content[start_pos:end_pos].strip()

        # Look for common document sections by name
        for section in _COMMON_SECTIONS:
            # The case-insensitive flag is scoped to the section name only, so
            # the [A-Z] terminator really means "next line starts with a
            # capital letter". A global (?i) would make it match any letter
            # and cut the section at its first continuation line.
            pattern = re.compile(
                rf'(?:^|\n)(?i:{re.escape(section)})[\s:]+(.*?)'
                r'(?=\n\s*\n|\n\s*[A-Z]|\Z)',
                re.DOTALL,
            )
            found = pattern.search(document_content)
            if found:
                sections[section] = found.group(1).strip()

        return sections

    def extract_key_terms(self, document_content: str) -> List[str]:
        """
        Extract key technical terms from document content.

        Three sources are combined: two-word phrases from sentences that
        contain a technical indicator (substring match), all-caps tokens, and
        standard references such as "ISO-9001" or "IEEE 829".

        Args:
            document_content: The document content

        Returns:
            List of unique lowercase key terms, in order of first appearance
        """
        key_terms: List[str] = []

        for sentence in _SENTENCE_BOUNDARY_PATTERN.split(document_content):
            sentence_lower = sentence.lower()
            # Only sentences with a technical indicator contribute phrases
            if any(indicator in sentence_lower
                   for indicator in self.technical_indicators):
                # Noun phrases (simplified approach): adjacent non-stopwords
                key_terms.extend(self._bigrams(sentence.split()))

        # All-caps terms (often defined terms or acronyms)
        key_terms.extend(
            term.lower() for term in _ALL_CAPS_PATTERN.findall(document_content)
        )

        # Standard references (e.g., ISO-9001, IEEE 829)
        for prefix in self.standard_prefixes:
            # Built per call so later changes to standard_prefixes are honoured
            pattern = re.compile(
                rf'\b{re.escape(prefix)}[-\s]?\d+\b', re.IGNORECASE
            )
            key_terms.extend(
                reference.lower()
                for reference in pattern.findall(document_content)
            )

        # Remove duplicates while keeping a deterministic order
        return list(dict.fromkeys(key_terms))

    def extract_standard_keywords(self, standard: Standard) -> List[str]:
        """
        Extract keywords from a standard that can be used for matching.

        Keywords are the standard name (plus variants with hyphens removed or
        replaced by spaces), non-stopword description words, requirement IDs,
        two-word phrases from requirement descriptions, and any technical
        indicator words those descriptions contain.

        Args:
            standard: The standard to extract keywords from

        Returns:
            List of unique, non-empty lowercase keywords, in order of first
            appearance
        """
        name_lower = standard.name.lower()

        # Standard name and variations
        keywords: List[str] = [
            name_lower,
            name_lower.replace("-", ""),
            name_lower.replace("-", " "),
        ]

        # Standard description words (excluding common words)
        if standard.description:
            keywords.extend(
                word
                for word in (raw.lower() for raw in standard.description.split())
                if word not in self.stopwords
            )

        for req in standard.requirements:
            keywords.append(req.id.lower())

            if req.description:
                words = req.description.split()
                # Noun phrases (simplified approach)
                keywords.extend(self._bigrams(words))
                # Individual technical terms
                keywords.extend(
                    word.lower()
                    for word in words
                    if word.lower() in self.technical_indicators
                )

        # Drop empty strings: "" is a substring of every text and would count
        # as a match everywhere. De-duplicate in a deterministic order.
        return [keyword for keyword in dict.fromkeys(keywords) if keyword]

    def calculate_standard_relevance(self,
                                     standard: Standard,
                                     document_content: str,
                                     sections: Dict[str, str],
                                     key_terms: List[str]) -> float:
        """
        Calculate a relevance score for a standard based on multiple factors.

        Components (maximum contribution in parentheses):
            1. Standard name found in the document (0.5 exact, 0.4 for a
               variant with hyphens removed or replaced by spaces).
            2. Fraction of the standard's keywords found in the document (0.3).
            3. Name found in an important section (0.1) and/or in any section
               title (0.2).
            4. Key terms overlapping a standard keyword, 0.01 each (0.2).
            5. Requirement description words (non-stopword, longer than three
               characters) found in the document, 0.01 each (0.2).

        Args:
            standard: The standard to evaluate
            document_content: The document content
            sections: Document sections
            key_terms: Key terms extracted from the document

        Returns:
            Relevance score (0.0 to 1.0)
        """
        document_lower = document_content.lower()
        name_lower = standard.name.lower()
        standard_keywords = self.extract_standard_keywords(standard)

        # 1. Standard name matches (highest weight). An empty name is skipped
        # because "" is trivially contained in every string.
        name_match_score = 0.0
        if name_lower:
            if name_lower in document_lower:
                name_match_score = 0.5
            elif name_lower.replace("-", "") in document_lower:
                name_match_score = 0.4
            elif name_lower.replace("-", " ") in document_lower:
                name_match_score = 0.4

        # 2. Keyword matches: share of keywords that occur in the document
        keyword_match_score = 0.0
        if standard_keywords:
            matched_keywords = sum(
                1 for keyword in standard_keywords if keyword in document_lower
            )
            keyword_match_score = matched_keywords / len(standard_keywords) * 0.3

        # 3. Section-specific matches
        section_match_score = 0.0
        if name_lower:
            # Name mentioned inside any one of the important sections
            if any(
                section_name in sections
                and name_lower in sections[section_name].lower()
                for section_name in _IMPORTANT_SECTIONS
            ):
                section_match_score += 0.1
            # Name mentioned in any section title
            if any(name_lower in title.lower() for title in sections):
                section_match_score += 0.2

        # 4. Key term matches: term and keyword overlap in either direction
        matching_terms = sum(
            1
            for term in key_terms
            if any(keyword in term or term in keyword
                   for keyword in standard_keywords)
        )
        term_match_score = min(0.2, 0.01 * matching_terms)

        # 5. Requirement-specific matches
        requirement_hits = 0
        for req in standard.requirements:
            # Descriptions are optional (extract_standard_keywords treats
            # them the same way); skip requirements that have none.
            if not req.description:
                continue
            requirement_hits += sum(
                1
                for word in req.description.lower().split()
                if word not in self.stopwords
                and len(word) > 3
                and word in document_lower
            )
        requirement_match_score = min(0.2, 0.01 * requirement_hits)

        # Final score: sum of all components, capped at 1.0
        final_score = (
            name_match_score
            + keyword_match_score
            + section_match_score
            + term_match_score
            + requirement_match_score
        )
        return min(final_score, 1.0)

    def find_relevant_standards(self,
                                document_content: str,
                                standards: List[Standard],
                                threshold: float = 0.1,
                                max_standards: int = 5) -> List[Tuple[str, float]]:
        """
        Find standards relevant to a document with relevance scores.

        Args:
            document_content: The document content
            standards: List of available standards
            threshold: Minimum relevance score threshold
            max_standards: Maximum number of standards to return; zero or a
                negative value yields an empty list

        Returns:
            List of tuples (standard_name, relevance_score) sorted by relevance
        """
        # A negative limit would otherwise slice from the tail and silently
        # drop the lowest-ranked entries instead of limiting the result.
        if not standards or max_standards <= 0:
            return []

        # Document-level features are computed once and shared by all standards
        sections = self.extract_document_sections(document_content)
        key_terms = self.extract_key_terms(document_content)

        standard_scores = []
        for standard in standards:
            score = self.calculate_standard_relevance(
                standard, document_content, sections, key_terms
            )
            if score >= threshold:
                standard_scores.append((standard.name, score))
                logger.debug(f"Standard {standard.name} relevance score: {score:.2f}")

        # Sort by relevance score (highest first)
        standard_scores.sort(key=lambda item: item[1], reverse=True)

        # Limit to max_standards
        return standard_scores[:max_standards]