305 lines
12 KiB
Python
305 lines
12 KiB
Python
# Standards matching logic
|
|
import re
|
|
from typing import Dict, List, Set, Tuple, Optional
|
|
from loguru import logger
|
|
|
|
from app.core.models import Standard, Requirement
|
|
|
|
|
|
class StandardsMatcher:
    """
    Advanced matching logic to identify relevant standards for documents.

    A document is scored against each candidate standard by combining five
    signals: direct mention of the standard's name, overlap with keywords
    derived from the standard, mention of the standard inside "important"
    document sections or section titles, overlap with key terms extracted
    from the document, and overlap with words of the standard's
    requirement descriptions.
    """

    # Markdown headings of level 1-3 ("# Title" .. "### Title").
    _HEADING_PATTERN = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)

    # Sentence boundary: terminal punctuation followed by whitespace.
    _SENTENCE_BOUNDARY = re.compile(r'[.!?]\s+')

    # All-caps tokens of two or more characters (e.g. "ISO", "QA2").
    _ACRONYM_PATTERN = re.compile(r'\b[A-Z][A-Z0-9]+\b')

    # Section names searched for as plain-text labels at the start of a line.
    _COMMON_SECTIONS = (
        "introduction", "scope", "purpose", "references", "definitions",
        "requirements", "compliance", "standards", "conclusion", "summary",
        "appendix", "annex",
    )

    # One pattern per common section.  Only the label itself is matched
    # case-insensitively (scoped ``(?i:...)``); the look-ahead that ends the
    # section must stay case-sensitive so that ``[A-Z]`` really means "a line
    # starting with a capital letter" rather than "any line starting with a
    # letter", which would truncate every section to its first line.
    _COMMON_SECTION_PATTERNS = {
        name: re.compile(
            rf'(?:^|\n)(?i:{name})[\s:]+(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)',
            re.DOTALL,
        )
        for name in _COMMON_SECTIONS
    }

    # Sections in which a mention of the standard's name earns a bonus.
    _IMPORTANT_SECTIONS = frozenset((
        "introduction", "scope", "purpose", "references",
        "standards", "compliance", "requirements",
    ))

    def __init__(self):
        """Initialize the standards matcher."""
        # Common stopwords to filter out when extracting keywords
        self.stopwords = {
            "the", "a", "an", "and", "or", "in", "on", "at", "to", "for", "with",
            "by", "of", "is", "are", "was", "were", "be", "been", "being", "have",
            "has", "had", "do", "does", "did", "but", "if", "then", "else", "when",
            "where", "why", "how", "all", "any", "both", "each", "few", "more",
            "most", "other", "some", "such", "no", "nor", "not", "only", "own",
            "same", "so", "than", "too", "very", "can", "will", "just", "should",
            "now", "this", "that", "these", "those"
        }

        # Technical terms that indicate compliance requirements.
        # NOTE: "should" is also listed in the stopwords above, so it is
        # excluded from two-word phrases but still counts as an indicator.
        self.technical_indicators = [
            "shall", "must", "required", "should", "recommended", "may", "optional",
            "compliant", "compliance", "conform", "standard", "specification", "requirement",
            "procedure", "process", "method", "test", "verify", "validate", "certification",
            "certified", "approved", "regulation", "regulatory", "guideline", "protocol"
        ]

        # Common standard prefixes and abbreviations
        self.standard_prefixes = [
            "iso", "ieee", "astm", "ansi", "iec", "din", "bs", "en", "jis",
            "gb", "api", "asme", "nfpa", "ul", "mil", "std", "rfc", "itu"
        ]

    @staticmethod
    def _unique(items):
        """Return ``items`` as a list without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    def _content_bigrams(self, words):
        """Return lowercase two-word phrases in which neither word is a stopword."""
        lowered = [word.lower() for word in words]
        return [
            f"{first} {second}"
            for first, second in zip(lowered, lowered[1:])
            if first not in self.stopwords and second not in self.stopwords
        ]

    def extract_document_sections(self, document_content: str) -> Dict[str, str]:
        """
        Extract sections from a document to improve matching.

        Three kinds of entries are produced:

        * ``"full_document"`` always maps to the unmodified input.
        * Every markdown heading (levels 1-3) maps its title, as written,
          to the text up to the next heading or the end of the document.
        * Every common section label (``"scope"``, ``"purpose"``, ...)
          found at the start of a line maps its lowercase name to the text
          that follows, up to a blank line, a line starting with a capital
          letter, or the end of the document.

        Args:
            document_content: The document content

        Returns:
            Dictionary of section name to section content
        """
        sections = {"full_document": document_content}

        # Markdown headings: content runs from the end of one heading line
        # to the start of the next heading (or to the end of the document).
        headings = list(self._HEADING_PATTERN.finditer(document_content))
        for index, heading in enumerate(headings):
            if index + 1 < len(headings):
                end_pos = headings[index + 1].start()
            else:
                end_pos = len(document_content)
            title = heading.group(2).strip()
            sections[title] = document_content[heading.end():end_pos].strip()

        # Plain-text section labels; only the first occurrence is used.
        for name, pattern in self._COMMON_SECTION_PATTERNS.items():
            found = pattern.search(document_content)
            if found:
                sections[name] = found.group(1).strip()

        return sections

    def extract_key_terms(self, document_content: str) -> List[str]:
        """
        Extract key technical terms from document content.

        Terms are collected from three sources, in this order:

        1. Two-word phrases (neither word a stopword) from sentences that
           contain one of ``self.technical_indicators`` as a substring.
        2. All-caps tokens of two or more characters.
        3. Standard references formed by one of ``self.standard_prefixes``,
           an optional hyphen or whitespace, and digits (e.g. "ISO-9001").

        Args:
            document_content: The document content

        Returns:
            List of lowercase key terms without duplicates, in first-seen order
        """
        key_terms = []

        for sentence in self._SENTENCE_BOUNDARY.split(document_content):
            sentence_lower = sentence.lower()
            # Only sentences that read like a requirement contribute phrases.
            if any(indicator in sentence_lower for indicator in self.technical_indicators):
                key_terms.extend(self._content_bigrams(sentence.split()))

        # Capitalized tokens (often defined terms or acronyms).
        key_terms.extend(
            token.lower() for token in self._ACRONYM_PATTERN.findall(document_content)
        )

        # Standard references (e.g., ISO-9001, IEEE 829).  The prefix is
        # escaped so that a prefix containing regex metacharacters cannot
        # alter the pattern.
        for prefix in self.standard_prefixes:
            reference = re.compile(rf'\b{re.escape(prefix)}[-\s]?\d+\b', re.IGNORECASE)
            key_terms.extend(hit.lower() for hit in reference.findall(document_content))

        # De-duplicate while keeping a deterministic order.
        return self._unique(key_terms)

    def extract_standard_keywords(self, standard: "Standard") -> List[str]:
        """
        Extract keywords from a standard that can be used for matching.

        The keywords consist of the standard's name (as-is, with hyphens
        removed, and with hyphens replaced by spaces), the non-stopword
        words of its description, and for every requirement its id, the
        two-word phrases of its description, and any word of the
        description that is one of ``self.technical_indicators``.

        Args:
            standard: The standard to extract keywords from

        Returns:
            List of lowercase keywords without duplicates, in first-seen
            order. Empty strings are never included.
        """
        name = standard.name
        keywords = [
            name.lower(),
            name.replace("-", "").lower(),
            name.replace("-", " ").lower(),
        ]

        # Description words, minus common words.
        if standard.description:
            keywords.extend(
                word.lower()
                for word in standard.description.split()
                if word.lower() not in self.stopwords
            )

        for req in standard.requirements:
            keywords.append(req.id.lower())

            if req.description:
                words = req.description.split()
                # Noun phrases and technical terms (simplified approach).
                keywords.extend(self._content_bigrams(words))
                keywords.extend(
                    word.lower()
                    for word in words
                    if word.lower() in self.technical_indicators
                )

        # An empty keyword is a substring of every document and would
        # count as a match everywhere, so it is dropped.
        return self._unique(keyword for keyword in keywords if keyword)

    def calculate_standard_relevance(self, standard: "Standard", document_content: str,
                                     sections: Dict[str, str], key_terms: List[str]) -> float:
        """
        Calculate a relevance score for a standard based on multiple factors.

        The score is the sum of five components, capped at 1.0:

        1. Name match: 0.5 if the standard's name occurs in the document,
           0.4 if only a hyphen-free or space-separated variant occurs.
        2. Keyword match: fraction of the standard's keywords that occur
           in the document, times 0.3.
        3. Section match: 0.1 if the name occurs in the content of an
           important section (section names compared case-insensitively),
           plus 0.2 if the name occurs in any section title.
        4. Key term match: 0.01 per document key term that contains, or is
           contained in, one of the standard's keywords; at most 0.2.
        5. Requirement match: 0.01 per requirement-description word
           (non-stopword, longer than three characters) that occurs in the
           document; at most 0.2.

        All text comparisons are lowercase substring checks.

        Args:
            standard: The standard to evaluate
            document_content: The document content
            sections: Document sections
            key_terms: Key terms extracted from the document

        Returns:
            Relevance score (0.0 to 1.0)
        """
        document_lower = document_content.lower()
        standard_keywords = self.extract_standard_keywords(standard)

        name_lower = standard.name.lower()
        name_compact = standard.name.replace("-", "").lower()
        name_spaced = standard.name.replace("-", " ").lower()

        # 1. Standard name matches (highest weight).
        if name_lower in document_lower:
            name_match_score = 0.5
        elif name_compact in document_lower or name_spaced in document_lower:
            name_match_score = 0.4
        else:
            name_match_score = 0.0

        # 2. Keyword matches.
        keyword_match_score = 0.0
        if standard_keywords:
            matched_keywords = sum(
                1 for keyword in standard_keywords if keyword in document_lower
            )
            keyword_match_score = matched_keywords / len(standard_keywords) * 0.3

        # 3. Section-specific matches.
        section_match_score = 0.0

        # Section titles taken from markdown headings keep their original
        # capitalization, so the title is lowercased before comparing it
        # with the lowercase names of the important sections.
        if any(
            title.lower() in self._IMPORTANT_SECTIONS and name_lower in content.lower()
            for title, content in sections.items()
        ):
            section_match_score += 0.1

        if any(name_lower in title.lower() for title in sections):
            section_match_score += 0.2

        # 4. Key term matches.
        matching_terms = sum(
            1
            for term in key_terms
            if any(keyword in term or term in keyword for keyword in standard_keywords)
        )
        term_match_score = min(0.2, 0.01 * matching_terms)

        # 5. Requirement-specific matches.
        requirement_hits = 0
        for req in standard.requirements:
            # A requirement without a description contributes nothing.
            description_lower = (req.description or "").lower()
            requirement_hits += sum(
                1
                for word in description_lower.split()
                if word not in self.stopwords
                and len(word) > 3
                and word in document_lower
            )
        requirement_match_score = min(0.2, 0.01 * requirement_hits)

        final_score = (
            name_match_score
            + keyword_match_score
            + section_match_score
            + term_match_score
            + requirement_match_score
        )

        # The components can add up to more than 1.0; cap the result.
        return min(final_score, 1.0)

    def find_relevant_standards(self, document_content: str, standards: List["Standard"],
                                threshold: float = 0.1, max_standards: int = 5) -> List[Tuple[str, float]]:
        """
        Find standards relevant to a document with relevance scores.

        Args:
            document_content: The document content
            standards: List of available standards
            threshold: Minimum relevance score threshold (inclusive)
            max_standards: Maximum number of standards to return; values
                below zero are treated as zero

        Returns:
            List of tuples (standard_name, relevance_score) sorted by relevance,
            highest first. Empty when ``standards`` is empty.
        """
        if not standards:
            return []

        # Sections and key terms depend only on the document, so they are
        # extracted once and reused for every standard.
        sections = self.extract_document_sections(document_content)
        key_terms = self.extract_key_terms(document_content)

        standard_scores = []
        for standard in standards:
            score = self.calculate_standard_relevance(
                standard, document_content, sections, key_terms
            )
            if score >= threshold:
                standard_scores.append((standard.name, score))
                logger.debug(f"Standard {standard.name} relevance score: {score:.2f}")

        # Sort by relevance score (highest first).
        standard_scores.sort(key=lambda entry: entry[1], reverse=True)

        # A negative limit would otherwise slice from the end of the list.
        return standard_scores[:max(max_standards, 0)]
|