# ds_scp_task_solution/app/services/standards_matcher.py

# Standards matching logic
import re
from typing import Dict, List, Set, Tuple, Optional
from loguru import logger

from app.core.models import Standard, Requirement


class StandardsMatcher:
    """
    Heuristic matcher that ranks standards by their relevance to a document.

    Relevance is a weighted sum of several plain-text signals: whether the
    standard's name appears in the document, how many of the standard's
    keywords appear, whether the name shows up in notable sections or section
    titles, overlap with key terms extracted from the document, and overlap
    with words from the standard's requirement descriptions.
    """

    def __init__(self):
        """Initialize the standards matcher."""
        # Common stopwords to filter out when extracting keywords
        self.stopwords = {
            "the", "a", "an", "and", "or", "in", "on", "at", "to", "for", "with",
            "by", "of", "is", "are", "was", "were", "be", "been", "being", "have",
            "has", "had", "do", "does", "did", "but", "if", "then", "else", "when",
            "where", "why", "how", "all", "any", "both", "each", "few", "more",
            "most", "other", "some", "such", "no", "nor", "not", "only", "own",
            "same", "so", "than", "too", "very", "can", "will", "just", "should",
            "now", "this", "that", "these", "those"
        }

        # Technical terms that indicate compliance requirements
        self.technical_indicators = [
            "shall", "must", "required", "should", "recommended", "may", "optional",
            "compliant", "compliance", "conform", "standard", "specification", "requirement",
            "procedure", "process", "method", "test", "verify", "validate", "certification",
            "certified", "approved", "regulation", "regulatory", "guideline", "protocol"
        ]

        # Common standard prefixes and abbreviations
        self.standard_prefixes = [
            "iso", "ieee", "astm", "ansi", "iec", "din", "bs", "en", "jis",
            "gb", "api", "asme", "nfpa", "ul", "mil", "std", "rfc", "itu"
        ]

    def _adjacent_word_pairs(self, words: List[str]) -> List[str]:
        """
        Build lowercase two-word phrases from adjacent non-stopword words.

        Args:
            words: Whitespace-separated tokens, in document order

        Returns:
            List of "first second" phrases (lowercase) for every adjacent pair
            in which neither word is a stopword
        """
        lowered = [word.lower() for word in words]
        return [
            f"{first} {second}"
            for first, second in zip(lowered, lowered[1:])
            if first not in self.stopwords and second not in self.stopwords
        ]

    def extract_document_sections(self, document_content: str) -> Dict[str, str]:
        """
        Extract sections from a document to improve matching.

        Two strategies are combined: markdown headings (levels 1-3) keyed by
        the heading text as written, and well-known section names that start a
        line (e.g. "Scope: ..."), keyed by the lowercase section name. The
        whole document is always available under "full_document".

        Args:
            document_content: The document content

        Returns:
            Dictionary of section name to section content
        """
        sections = {"full_document": document_content}

        # Markdown headings: a section runs from the end of its heading line
        # to the start of the next heading (or the end of the document).
        headings = list(
            re.finditer(r'^(#{1,3})\s+(.+)$', document_content, re.MULTILINE)
        )
        for index, heading in enumerate(headings):
            title = heading.group(2).strip()
            body_start = heading.end()
            if index + 1 < len(headings):
                body_end = headings[index + 1].start()
            else:
                body_end = len(document_content)
            sections[title] = document_content[body_start:body_end].strip()

        # Look for common document sections by name
        common_sections = [
            "introduction", "scope", "purpose", "references", "definitions",
            "requirements", "compliance", "standards", "conclusion", "summary",
            "appendix", "annex"
        ]

        for section in common_sections:
            # Only the section name is matched case-insensitively. The flag is
            # scoped with (?i:...) so that the [A-Z] lookahead really means
            # "next line starts with a capital letter"; a global (?i) would
            # end the section at any line beginning with a letter.
            pattern = re.compile(
                rf'(?:^|\n)(?i:{re.escape(section)})[\s:]+(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)',
                re.DOTALL,
            )
            found = pattern.search(document_content)
            if found:
                sections[section] = found.group(1).strip()

        return sections

    def extract_key_terms(self, document_content: str) -> List[str]:
        """
        Extract key technical terms from document content.

        Collects three kinds of terms, all lowercased: adjacent non-stopword
        word pairs from sentences containing a technical indicator, all-caps
        tokens (e.g. acronyms), and standard references such as "ISO-9001" or
        "IEEE 829".

        Args:
            document_content: The document content

        Returns:
            List of unique key terms in first-seen order
        """
        key_terms = []

        # Word pairs are only taken from sentences that look like
        # requirements (contain one of the technical indicators).
        for sentence in re.split(r'[.!?]\s+', document_content):
            sentence_lower = sentence.lower()
            if any(indicator in sentence_lower for indicator in self.technical_indicators):
                key_terms.extend(self._adjacent_word_pairs(sentence.split()))

        # Look for capitalized terms (often defined terms)
        key_terms.extend(
            term.lower() for term in re.findall(r'\b[A-Z][A-Z0-9]+\b', document_content)
        )

        # Look for standard references (e.g., ISO-9001, IEEE 829)
        for prefix in self.standard_prefixes:
            reference_pattern = re.compile(
                rf'\b{re.escape(prefix)}[-\s]?\d+\b', re.IGNORECASE
            )
            key_terms.extend(
                reference.lower() for reference in reference_pattern.findall(document_content)
            )

        # Remove duplicates while keeping a deterministic (first-seen) order
        return list(dict.fromkeys(key_terms))

    def extract_standard_keywords(self, standard: Standard) -> List[str]:
        """
        Extract keywords from a standard that can be used for matching.

        Keywords are the standard's name (plus variants with hyphens removed
        or replaced by spaces), non-stopword words of its description, each
        requirement's id, and word pairs / technical indicator words from each
        requirement's description. Everything is lowercased.

        Args:
            standard: The standard to extract keywords from

        Returns:
            List of unique, non-empty keywords in first-seen order
        """
        keywords = []

        # Add standard name and variations
        name_lower = (standard.name or "").lower()
        keywords.append(name_lower)
        keywords.append(name_lower.replace("-", ""))
        keywords.append(name_lower.replace("-", " "))

        # Add standard description words (excluding common words)
        if standard.description:
            keywords.extend(
                word for word in standard.description.lower().split()
                if word not in self.stopwords
            )

        # Add requirement keywords
        for req in standard.requirements or []:
            if req.id:
                keywords.append(req.id.lower())

            if req.description:
                words = req.description.split()

                # Key phrases (simplified noun-phrase approximation)
                keywords.extend(self._adjacent_word_pairs(words))

                # Individual technical terms
                keywords.extend(
                    word.lower() for word in words
                    if word.lower() in self.technical_indicators
                )

        # An empty keyword is a substring of every text and would count as a
        # match everywhere, so it is dropped along with duplicates.
        return [keyword for keyword in dict.fromkeys(keywords) if keyword]

    def calculate_standard_relevance(self, standard: Standard, document_content: str,
                                    sections: Dict[str, str], key_terms: List[str]) -> float:
        """
        Calculate a relevance score for a standard based on multiple factors.

        Score components (summed, then capped at 1.0):
            - name match: 0.5 for the exact name, 0.4 for a hyphen-less or
              space-separated variant
            - keyword match: fraction of the standard's keywords found in the
              document, scaled to at most 0.3
            - section match: 0.1 if the name appears in a notable section,
              plus 0.2 if it appears in any section title
            - key term match: 0.01 per overlapping document key term, at most 0.2
            - requirement match: 0.01 per requirement-description word (longer
              than 3 characters, not a stopword) found in the document, at most 0.2

        Args:
            standard: The standard to evaluate
            document_content: The document content
            sections: Document sections
            key_terms: Key terms extracted from the document

        Returns:
            Relevance score (0.0 to 1.0)
        """
        document_lower = document_content.lower()
        name_lower = (standard.name or "").lower()

        # Extract keywords for this standard
        standard_keywords = self.extract_standard_keywords(standard)

        # 1. Check for standard name matches (highest weight).
        # An empty name is skipped: "" is contained in every string.
        name_match_score = 0.0
        if name_lower:
            if name_lower in document_lower:
                name_match_score = 0.5
            elif (name_lower.replace("-", "") in document_lower
                  or name_lower.replace("-", " ") in document_lower):
                name_match_score = 0.4

        # 2. Check for keyword matches
        keyword_match_score = 0.0
        total_keywords = len(standard_keywords)
        if total_keywords > 0:
            matched_keywords = sum(
                1 for keyword in standard_keywords if keyword in document_lower
            )
            keyword_match_score = matched_keywords / total_keywords * 0.3

        # 3. Check for section-specific matches
        important_sections = ["introduction", "scope", "purpose", "references",
                             "standards", "compliance", "requirements"]

        section_match_score = 0.0
        if name_lower:
            # Name mentioned inside at least one notable section
            if any(name_lower in sections[section_name].lower()
                   for section_name in important_sections
                   if section_name in sections):
                section_match_score += 0.1

            # Name mentioned in at least one section title
            if any(name_lower in title.lower() for title in sections):
                section_match_score += 0.2

        # 4. Check for key term matches (substring overlap in either direction)
        matching_terms = sum(
            1 for term in key_terms
            if term and any(kw in term or term in kw for kw in standard_keywords)
        )
        term_match_score = min(0.2, 0.01 * matching_terms)

        # 5. Check for requirement-specific matches
        requirement_hits = 0
        for req in standard.requirements or []:
            # Requirements without a description contribute nothing
            if not req.description:
                continue
            requirement_hits += sum(
                1 for word in req.description.lower().split()
                if word not in self.stopwords
                and len(word) > 3
                and word in document_lower
            )
        requirement_match_score = min(0.2, 0.01 * requirement_hits)

        # Calculate final score (weighted sum of all components)
        final_score = (
            name_match_score +
            keyword_match_score +
            section_match_score +
            term_match_score +
            requirement_match_score
        )

        # Cap at 1.0
        return min(final_score, 1.0)

    def find_relevant_standards(self, document_content: str, standards: List[Standard],
                               threshold: float = 0.1, max_standards: int = 5) -> List[Tuple[str, float]]:
        """
        Find standards relevant to a document with relevance scores.

        Args:
            document_content: The document content
            standards: List of available standards
            threshold: Minimum relevance score threshold
            max_standards: Maximum number of standards to return; negative
                values are treated as 0 and None returns every match

        Returns:
            List of tuples (standard_name, relevance_score) sorted by relevance
        """
        if not standards:
            return []

        # Sections and key terms depend only on the document, so they are
        # computed once and shared across all standards.
        sections = self.extract_document_sections(document_content)
        key_terms = self.extract_key_terms(document_content)

        # Calculate relevance scores for each standard
        standard_scores = []
        for standard in standards:
            score = self.calculate_standard_relevance(
                standard, document_content, sections, key_terms
            )

            if score >= threshold:
                standard_scores.append((standard.name, score))
                logger.debug(f"Standard {standard.name} relevance score: {score:.2f}")

        # Sort by relevance score (highest first); the sort is stable, so ties
        # keep the order of the input list.
        standard_scores.sort(key=lambda scored: scored[1], reverse=True)

        # Limit to max_standards. A negative limit must not be used as a
        # slice bound, where it would drop items from the tail instead.
        if max_standards is None:
            return standard_scores
        return standard_scores[:max(max_standards, 0)]