# Utility functions
import html
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional

# Markdown heading: 1-6 '#' characters, whitespace, then the heading text.
_HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)

# Fenced code block with an optional language tag after the opening fence.
_CODE_BLOCK_PATTERN = re.compile(r'```(\w*)\n([\s\S]*?)```')

_SENTENCE_SPLIT_PATTERN = re.compile(r'[.!?]+')
_WORD_PATTERN = re.compile(r'\b\w+\b')
_ILLEGAL_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')
_MAX_FILENAME_LENGTH = 255

_FILE_TYPES_BY_EXTENSION = {
    'md': 'markdown',
    'markdown': 'markdown',
    'txt': 'text',
    'text': 'text',
    'json': 'json',
    'yaml': 'yaml',
    'yml': 'yaml',
    'html': 'html',
    'htm': 'html',
}


def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
    """
    Extract sections from a markdown document.

    The result always contains the key ``"full_document"`` holding the
    unmodified input. Every heading adds a key ``"h<level>_<heading text>"``
    whose value is the stripped text between that heading and the next
    heading (or the end of the document). Headings with the same level and
    text share a key, so the later section overwrites the earlier one.

    Args:
        markdown_text: The markdown text to parse

    Returns:
        Dictionary mapping section names to section content
    """
    sections = {"full_document": markdown_text}

    matches = list(_HEADING_PATTERN.finditer(markdown_text))
    for index, match in enumerate(matches):
        heading_level = len(match.group(1))
        section_name = match.group(2).strip()

        # Content runs from the end of this heading line to the start of the
        # next heading, or to the end of the text for the last heading.
        start_pos = match.end()
        if index + 1 < len(matches):
            end_pos = matches[index + 1].start()
        else:
            end_pos = len(markdown_text)

        sections[f"h{heading_level}_{section_name}"] = (
            markdown_text[start_pos:end_pos].strip()
        )

    return sections


def detect_file_type(filename: str) -> str:
    """
    Detect file type from filename extension.

    The extension is compared case-insensitively.

    Args:
        filename: Name of the file

    Returns:
        One of 'markdown', 'text', 'json', 'yaml', 'html', or 'unknown'
    """
    _, extension = os.path.splitext(filename)
    ext = extension.lower().lstrip('.')
    return _FILE_TYPES_BY_EXTENSION.get(ext, 'unknown')


def parse_code_blocks(content: str) -> List[Dict[str, str]]:
    """
    Extract fenced code blocks from markdown content.

    Args:
        content: Markdown content with code blocks

    Returns:
        List of dictionaries with keys 'language' and 'code'. A block with
        no language tag is reported with language 'text'.
    """
    return [
        {'language': language.strip() or 'text', 'code': code.strip()}
        for language, code in _CODE_BLOCK_PATTERN.findall(content)
    ]


def format_timestamp(timestamp: datetime) -> str:
    """
    Format timestamp for display.

    Args:
        timestamp: Datetime object

    Returns:
        Timestamp formatted as "YYYY-MM-DD HH:MM:SS"
    """
    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


def calculate_readability_score(text: str) -> float:
    """
    Calculate a simple readability score for text.

    The score depends only on the average number of words per sentence:
    up to 10 -> 0.7, up to 20 -> 1.0, up to 30 -> 0.8, otherwise 0.5.

    Args:
        text: The text to analyze

    Returns:
        Readability score (0.0-1.0); 0.0 when the text has no words
    """
    if not text:
        return 0.0

    # re.split leaves an empty string after terminal punctuation; counting it
    # as a sentence would understate the average sentence length.
    sentences = [
        sentence
        for sentence in _SENTENCE_SPLIT_PATTERN.split(text)
        if sentence.strip()
    ]
    words = _WORD_PATTERN.findall(text)

    if not words or not sentences:
        return 0.0

    avg_words_per_sentence = len(words) / len(sentences)

    if avg_words_per_sentence <= 10:
        return 0.7  # Very short sentences
    if avg_words_per_sentence <= 20:
        return 1.0  # Optimal
    if avg_words_per_sentence <= 30:
        return 0.8  # Getting long
    return 0.5  # Too long


def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to be safe for filesystem.

    Each of the characters < > : " / \\ | ? * is replaced by an underscore,
    and names longer than 255 characters are shortened by trimming the base
    name while keeping the extension.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename
    """
    sanitized = _ILLEGAL_FILENAME_CHARS.sub('_', filename)

    if len(sanitized) > _MAX_FILENAME_LENGTH:
        base, ext = os.path.splitext(sanitized)
        sanitized = base[:_MAX_FILENAME_LENGTH - len(ext)] + ext

    return sanitized


def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load compliance standards from a JSON file.

    Args:
        file_path: Path to the standards JSON file

    Returns:
        The parsed JSON content, or an empty list if the file does not
        exist or does not contain valid JSON
    """
    try:
        # JSON is read as UTF-8 rather than the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Best-effort load: a missing or invalid file means "no standards".
        return []


def _escape(value: Any) -> str:
    """Return *value* as text that is safe to embed in HTML."""
    return html.escape(str(value), quote=True)


def _render_applied_standards(standards: List[str]) -> str:
    """
    Render HTML for applied standards section.

    NOTE(review): the markup here is reconstructed; only the visible text
    "Applied Standards" survived in the source. Confirm tag and class names
    against the intended template.

    Args:
        standards: List of standard names

    Returns:
        HTML string for the applied standards section, or an empty string
        when there are no standards
    """
    if not standards:
        return ""

    items = "\n".join(
        f"        <li>{_escape(standard)}</li>" for standard in standards
    )
    return (
        '<div class="standards">\n'
        '    <h2>Applied Standards</h2>\n'
        '    <ul>\n'
        f'{items}\n'
        '    </ul>\n'
        '</div>\n'
    )


def generate_html_report(report_data: Dict[str, Any]) -> str:
    """
    Generate HTML for compliance report.

    Keys read from ``report_data`` (all optional): 'document_name',
    'timestamp', 'compliance_score' (a fraction, shown as a percentage),
    'summary', 'applied_standards' and 'issues'. Each issue is a dict with
    optional keys 'section', 'level', 'description' and 'recommendation'.
    All values are HTML-escaped before being inserted into the page.

    NOTE(review): the markup is reconstructed around the visible text that
    survived in the source; any original styling could not be recovered.

    Args:
        report_data: Report data dictionary

    Returns:
        HTML string for the report
    """
    document_name = report_data.get('document_name', 'Unknown')
    timestamp = report_data.get('timestamp', format_timestamp(datetime.now()))
    score_percent = report_data.get('compliance_score', 0) * 100
    summary = report_data.get('summary', 'No summary available.')
    standards_html = _render_applied_standards(
        report_data.get('applied_standards', [])
    )

    parts = [
        '<!DOCTYPE html>\n'
        '<html>\n'
        '<head>\n'
        '    <meta charset="utf-8">\n'
        '    <title>Compliance Report</title>\n'
        '</head>\n'
        '<body>\n'
        '<h1>Compliance Report</h1>\n'
        f'<p><strong>Document:</strong> {_escape(document_name)}</p>\n'
        f'<p><strong>Generated:</strong> {_escape(timestamp)}</p>\n'
        f'<p><strong>Compliance Score:</strong> {score_percent:.1f}%</p>\n'
        '<h2>Summary</h2>\n'
        f'<p>{_escape(summary)}</p>\n',
        standards_html,
        '<h2>Compliance Issues</h2>\n',
    ]

    issues = report_data.get('issues', [])
    if not issues:
        parts.append('<p>No compliance issues found.</p>\n')
    else:
        for issue in issues:
            level = issue.get('level', 'info').lower()
            section = issue.get('section', 'Unknown Section')
            description = issue.get('description', 'No description')
            recommendation = issue.get('recommendation', 'No recommendation')
            parts.append(
                f'<div class="issue {_escape(level)}">\n'
                f'    <h3>{_escape(section)}</h3>\n'
                f'    <p><strong>{_escape(level.upper())}</strong> '
                f'{_escape(description)}</p>\n'
                f'    <p><strong>Recommendation:</strong> '
                f'{_escape(recommendation)}</p>\n'
                '</div>\n'
            )

    parts.append('</body>\n</html>\n')
    return "".join(parts)