# app/utils/helpers.py

# Utility functions
import re
from typing import Dict, List, Any, Optional
import os
from datetime import datetime
import json

def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
    """
    Extract sections from a markdown document.

    The whole document is always stored under the key "full_document".
    Every ATX heading ("#" to "######" followed by spaces/tabs and text)
    adds one more entry keyed as "h<level>_<heading text>", whose value is
    the stripped text between that heading and the next heading of any
    level (or the end of the document).

    Limitations:
        - Two headings with the same level and text share a key; the later
          section overwrites the earlier one.
        - Lines starting with "#" inside fenced code blocks are still
          treated as headings.

    Args:
        markdown_text: The markdown text to parse

    Returns:
        Dictionary mapping section names to section content
    """
    sections: Dict[str, str] = {"full_document": markdown_text}

    # Only spaces/tabs may separate the hashes from the heading text. Using
    # \s here would also match a newline, so a bare "#" line would swallow
    # the following line and report it as the heading name.
    heading_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
    headings = list(heading_pattern.finditer(markdown_text))

    # Each section ends where the next heading starts; the last one runs to
    # the end of the document.
    section_ends = [heading.start() for heading in headings[1:]]
    section_ends.append(len(markdown_text))

    for heading, end_pos in zip(headings, section_ends):
        level = len(heading.group(1))
        title = heading.group(2).strip()
        body = markdown_text[heading.end():end_pos].strip()
        sections[f"h{level}_{title}"] = body

    return sections

def detect_file_type(filename: str) -> str:
    """
    Map a filename's extension to a coarse file-type label.

    The extension is taken from the last dot in the name and compared
    case-insensitively. Names without a recognised extension yield
    'unknown'.

    Args:
        filename: Name of the file

    Returns:
        One of 'markdown', 'text', 'json', 'yaml', 'html' or 'unknown'
    """
    type_by_extension = {
        'md': 'markdown',
        'markdown': 'markdown',
        'txt': 'text',
        'text': 'text',
        'json': 'json',
        'yaml': 'yaml',
        'yml': 'yaml',
        'html': 'html',
        'htm': 'html',
    }

    suffix = os.path.splitext(filename)[1]
    normalized = suffix.lower().lstrip('.')
    return type_by_extension.get(normalized, 'unknown')

def parse_code_blocks(content: str) -> List[Dict[str, str]]:
    """
    Extract fenced (```) code blocks from markdown content.

    The text after the opening fence (the "info string") may contain any
    characters except a newline or backtick, so languages such as "c++" or
    "objective-c", extra attributes ("python title=x") and CRLF line endings
    are all accepted. Only the first word of the info string is reported as
    the language; an empty info string is reported as 'text'.

    Args:
        content: Markdown content with code blocks

    Returns:
        List of dictionaries with the keys 'language' and 'code', in the
        order the blocks appear. 'code' has surrounding whitespace stripped.
    """
    # Matching only \w* for the info string would reject an opening fence
    # like "```c++"; the scan would then pair that block's closing fence
    # with the next opening fence and return the prose in between as code.
    fence_pattern = re.compile(r'```([^\n`]*)\n([\s\S]*?)```')

    code_blocks = []
    for info_string, code in fence_pattern.findall(content):
        info_words = info_string.split()
        code_blocks.append({
            'language': info_words[0] if info_words else 'text',
            'code': code.strip(),
        })

    return code_blocks

def format_timestamp(timestamp: datetime) -> str:
    """
    Render a datetime as "YYYY-MM-DD HH:MM:SS" for display.

    Args:
        timestamp: Datetime object

    Returns:
        Formatted timestamp string (24-hour clock, zero-padded fields)
    """
    display_format = "%Y-%m-%d %H:%M:%S"
    formatted = timestamp.strftime(display_format)
    return formatted

def calculate_readability_score(text: str) -> float:
    """
    Calculate a simple readability score for text.

    The score depends only on the average number of words per sentence.
    Sentences are the fragments between runs of '.', '!' or '?' that contain
    at least one word character, so terminal punctuation does not add an
    empty sentence to the count.

    Score bands (average words per sentence):
        <= 10      -> 0.7  (very short sentences)
        10 to 20   -> 1.0  (optimal)
        20 to 30   -> 0.8  (getting long)
        over 30    -> 0.5  (too long)

    Args:
        text: The text to analyze

    Returns:
        Readability score (0.0-1.0); 0.0 when the text contains no words
    """
    if not text:
        return 0.0

    # re.split leaves an empty trailing fragment whenever the text ends with
    # punctuation; counting it would understate the average sentence length.
    sentences = [
        fragment
        for fragment in re.split(r'[.!?]+', text)
        if re.search(r'\w', fragment)
    ]
    words = re.findall(r'\b\w+\b', text)

    if not words or not sentences:
        return 0.0

    avg_words_per_sentence = len(words) / len(sentences)

    if avg_words_per_sentence <= 10:
        return 0.7  # Very short sentences
    if avg_words_per_sentence <= 20:
        return 1.0  # Optimal
    if avg_words_per_sentence <= 30:
        return 0.8  # Getting long
    return 0.5  # Too long

def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to be safe for filesystem.

    Each of the characters < > : " / \\ | ? * is replaced with an
    underscore, and the result is limited to 255 characters. When
    truncation is needed the extension is preserved and the base name is
    shortened; if the extension alone exceeds the limit, the whole name is
    cut to 255 characters instead.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename, at most 255 characters long
    """
    max_length = 255

    # Replace illegal characters
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

    if len(sanitized) > max_length:
        base, ext = os.path.splitext(sanitized)
        if len(ext) <= max_length:
            sanitized = base[:max_length - len(ext)] + ext
        else:
            # An over-long extension would make the slice bound negative and
            # leave the result longer than the limit, so cut the whole name.
            sanitized = sanitized[:max_length]

    return sanitized

def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load compliance standards from a JSON file.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated)
    rather than with the platform's default encoding, so the same file
    loads identically on every system. The parsed JSON value is returned
    as-is; it is not checked to be a list.

    Args:
        file_path: Path to the standards JSON file

    Returns:
        The parsed JSON content, or an empty list if the file does not
        exist, is not valid UTF-8, or is not valid JSON. Other I/O errors
        (for example permission errors) propagate to the caller.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as standards_file:
            return json.load(standards_file)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # Missing or unreadable-as-JSON files are treated as "no standards".
        return []

def _escape_html(value: Any) -> str:
    """
    Convert a value to text and escape it for inclusion in HTML.

    Args:
        value: Any value; it is converted with str() first

    Returns:
        Text with &, <, > and quote characters replaced by HTML entities
    """
    # Imported locally so this helper does not depend on the module's
    # top-level import list.
    from html import escape

    return escape(str(value), quote=True)


def _render_applied_standards(standards: List[str]) -> str:
    """
    Render HTML for applied standards section.

    Standard names are HTML-escaped before being placed in the list items.

    Args:
        standards: List of standard names

    Returns:
        HTML string for the applied standards section, or an empty string
        when there are no standards
    """
    if not standards:
        return ""

    opening = """<div style="margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
        <h3 style="margin-top: 0; font-size: 16px; color: #495057;">Applied Standards</h3>
        <ul style="margin: 5px 0 0 20px; padding: 0;">
    """

    items = "".join(
        f'<li style="margin-bottom: 3px;">{_escape_html(standard)}</li>\n'
        for standard in standards
    )

    return opening + items + "</ul></div>"


def generate_html_report(report_data: Dict[str, Any]) -> str:
    """
    Generate HTML for compliance report.

    Recognised keys of report_data (all optional):
        document_name: shown in the header (default 'Unknown')
        timestamp: shown in the header (default: current local time)
        compliance_score: number multiplied by 100 and shown as a percentage
            with one decimal; a missing or None score is shown as 0.0%
        summary: summary paragraph (default 'No summary available.')
        applied_standards: list of standard names
        issues: list of dicts with 'level', 'section', 'description' and
            'recommendation'; 'level' is lower-cased and used both as a CSS
            class and (upper-cased) as the badge text, defaulting to 'info'

    Every caller-supplied value is HTML-escaped before being inserted, so
    text such as document names or issue descriptions cannot inject markup
    into the report.

    Args:
        report_data: Report data dictionary

    Returns:
        HTML string for the report
    """
    document_name = _escape_html(report_data.get('document_name', 'Unknown'))
    generated_at = _escape_html(
        report_data.get('timestamp', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    )
    summary = _escape_html(report_data.get('summary', 'No summary available.'))
    # A score that is present but None is treated like a missing score.
    score = report_data.get('compliance_score') or 0
    score_percent = f"{score * 100:.1f}"
    standards_html = _render_applied_standards(report_data.get('applied_standards', []))

    # Simple HTML template for the report
    parts = [f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Compliance Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }}
            .header {{ background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }}
            .summary {{ margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }}
            .issues {{ margin: 20px 0; }}
            .issue {{ margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }}
            .issue.critical {{ background-color: #fdedec; border-left-color: #c0392b; }}
            .issue.major {{ background-color: #fef9e7; border-left-color: #f1c40f; }}
            .issue.minor {{ background-color: #eafaf1; border-left-color: #2ecc71; }}
            .issue.info {{ background-color: #ebf5fb; border-left-color: #3498db; }}
            .issue h3 {{ margin-top: 0; }}
            .issue p {{ margin: 5px 0; }}
            .badge {{ display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }}
            .badge.critical {{ background-color: #c0392b; }}
            .badge.major {{ background-color: #f1c40f; color: #333; }}
            .badge.minor {{ background-color: #2ecc71; }}
            .badge.info {{ background-color: #3498db; }}
            .score {{ font-size: 24px; font-weight: bold; }}
            .score-container {{ text-align: right; }}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>Compliance Report</h1>
            <p>Document: {document_name}</p>
            <p>Generated: {generated_at}</p>
            <div class="score-container">
                <span>Compliance Score: </span>
                <span class="score">{score_percent}%</span>
            </div>
        </div>

        <div class="summary">
            <h2>Summary</h2>
            <p>{summary}</p>

            {standards_html}
        </div>

        <div class="issues">
            <h2>Compliance Issues</h2>
    """]

    # Add issues
    issues = report_data.get('issues') or []
    if not issues:
        parts.append("<p>No compliance issues found.</p>")
    else:
        for issue in issues:
            level = str(issue.get('level') or 'info').lower()
            # Escape the class name and the badge label separately:
            # upper-casing an already escaped string would corrupt entities.
            level_class = _escape_html(level)
            level_label = _escape_html(level.upper())
            section = _escape_html(issue.get('section', 'Unknown Section'))
            description = _escape_html(issue.get('description', 'No description'))
            recommendation = _escape_html(issue.get('recommendation', 'No recommendation'))
            parts.append(f"""
            <div class="issue {level_class}">
                <h3>{section}</h3>
                <p><span class="badge {level_class}">{level_label}</span> {description}</p>
                <p><strong>Recommendation:</strong> {recommendation}</p>
            </div>
            """)

    # Close HTML
    parts.append("""
        </div>
    </body>
    </html>
    """)

    return "".join(parts)
Initial commit 2025-07-17 22:20:25 +01:00			`# Utility functions`
			`import re`
			`from typing import Dict, List, Any, Optional`
			`import os`
			`from datetime import datetime`
			`import json`

			`def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:`
			`"""`
			`Extract sections from a markdown document.`

			`Args:`
			`markdown_text: The markdown text to parse`

			`Returns:`
			`Dictionary mapping section names to section content`
			`"""`
			`sections = {}`

			`# Add the whole document as one section`
			`sections["full_document"] = markdown_text`

			`# Split by markdown headings`
			`heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)`
			`matches = list(heading_pattern.finditer(markdown_text))`

			`if matches:`
			`for i, match in enumerate(matches):`
			`heading_level = len(match.group(1))`
			`section_name = match.group(2).strip()`

			`# Get section content (from this heading to the next, or to the end)`
			`start_pos = match.end()`
			`end_pos = matches[i+1].start() if i < len(matches) - 1 else len(markdown_text)`

			`section_content = markdown_text[start_pos:end_pos].strip()`
			`section_key = f"h{heading_level}_{section_name}"`

			`sections[section_key] = section_content`

			`return sections`

			`def detect_file_type(filename: str) -> str:`
			`"""`
			`Detect file type from filename extension.`

			`Args:`
			`filename: Name of the file`

			`Returns:`
			`File type (markdown, text, etc.)`
			`"""`
			`_, extension = os.path.splitext(filename)`
			`ext = extension.lower().lstrip('.')`

			`if ext in ['md', 'markdown']:`
			`return 'markdown'`
			`elif ext in ['txt', 'text']:`
			`return 'text'`
			`elif ext in ['json']:`
			`return 'json'`
			`elif ext in ['yaml', 'yml']:`
			`return 'yaml'`
			`elif ext in ['html', 'htm']:`
			`return 'html'`
			`else:`
			`return 'unknown'`

			`def parse_code_blocks(content: str) -> List[Dict[str, str]]:`
			`"""`
			`Extract code blocks from markdown content.`

			`Args:`
			`content: Markdown content with code blocks`

			`Returns:`
			`List of dictionaries with language and code`
			`"""`
			`# Pattern to match code blocks with optional language`
			pattern = r'```(\w)\n([\s\S]?)```'
			`matches = re.findall(pattern, content)`

			`code_blocks = []`
			`for language, code in matches:`
			`code_blocks.append({`
			`'language': language.strip() or 'text',`
			`'code': code.strip()`
			`})`

			`return code_blocks`

			`def format_timestamp(timestamp: datetime) -> str:`
			`"""`
			`Format timestamp for display.`

			`Args:`
			`timestamp: Datetime object`

			`Returns:`
			`Formatted timestamp string`
			`"""`
			`return timestamp.strftime("%Y-%m-%d %H:%M:%S")`

			`def calculate_readability_score(text: str) -> float:`
			`"""`
			`Calculate a simple readability score for text.`

			`Args:`
			`text: The text to analyze`

			`Returns:`
			`Readability score (0.0-1.0)`
			`"""`
			`if not text:`
			`return 0.0`

			`# Split into sentences and words`
			`sentences = re.split(r'[.!?]+', text)`
			`words = re.findall(r'\b\w+\b', text)`

			`if not words or not sentences:`
			`return 0.0`

			`# Average words per sentence`
			`avg_words_per_sentence = len(words) / len(sentences)`

			`# Simple readability score based on average words per sentence`
			`# Optimal is around 15-20 words per sentence`
			`if avg_words_per_sentence <= 10:`
			`score = 0.7 # Very short sentences`
			`elif 10 < avg_words_per_sentence <= 20:`
			`score = 1.0 # Optimal`
			`elif 20 < avg_words_per_sentence <= 30:`
			`score = 0.8 # Getting long`
			`else:`
			`score = 0.5 # Too long`

			`return score`

			`def sanitize_filename(filename: str) -> str:`
			`"""`
			`Sanitize filename to be safe for filesystem.`

			`Args:`
			`filename: Original filename`

			`Returns:`
			`Sanitized filename`
			`"""`
			`# Replace illegal characters`
			`sanitized = re.sub(r'[<>:"/\\\|?*]', '_', filename)`

			`# Ensure it's not too long`
			`if len(sanitized) > 255:`
			`base, ext = os.path.splitext(sanitized)`
			`sanitized = base[:255-len(ext)] + ext`

			`return sanitized`

			`def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:`
			`"""`
			`Load compliance standards from a JSON file.`

			`Args:`
			`file_path: Path to the standards JSON file`

			`Returns:`
			`List of standard dictionaries`
			`"""`
			`try:`
			`with open(file_path, 'r') as f:`
			`standards = json.load(f)`
			`return standards`
			`except (FileNotFoundError, json.JSONDecodeError):`
			`# Return empty list if file not found or invalid`
			`return []`

			`def _render_applied_standards(standards: List[str]) -> str:`
			`"""`
			`Render HTML for applied standards section.`

			`Args:`
			`standards: List of standard names`

			`Returns:`
			`HTML string for the applied standards section`
			`"""`
			`if not standards:`
			`return ""`

			`html = """<div style="margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">`
			`<h3 style="margin-top: 0; font-size: 16px; color: #495057;">Applied Standards</h3>`
			`<ul style="margin: 5px 0 0 20px; padding: 0;">`
			`"""`

			`for standard in standards:`
			`html += f"<li style=\"margin-bottom: 3px;\">{standard}</li>\n"`

			`html += "</ul></div>"`
			`return html`

			`def generate_html_report(report_data: Dict[str, Any]) -> str:`
			`"""`
			`Generate HTML for compliance report.`

			`Args:`
			`report_data: Report data dictionary`

			`Returns:`
			`HTML string for the report`
			`"""`
			`# Simple HTML template for the report`
			`html = f"""`
			`<!DOCTYPE html>`
			`<html>`
			`<head>`
			`<title>Compliance Report</title>`
			`<style>`
			`body {{ font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }}`
			`.header {{ background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }}`
			`.summary {{ margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }}`
			`.issues {{ margin: 20px 0; }}`
			`.issue {{ margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }}`
			`.issue.critical {{ background-color: #fdedec; border-left-color: #c0392b; }}`
			`.issue.major {{ background-color: #fef9e7; border-left-color: #f1c40f; }}`
			`.issue.minor {{ background-color: #eafaf1; border-left-color: #2ecc71; }}`
			`.issue.info {{ background-color: #ebf5fb; border-left-color: #3498db; }}`
			`.issue h3 {{ margin-top: 0; }}`
			`.issue p {{ margin: 5px 0; }}`
			`.badge {{ display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }}`
			`.badge.critical {{ background-color: #c0392b; }}`
			`.badge.major {{ background-color: #f1c40f; color: #333; }}`
			`.badge.minor {{ background-color: #2ecc71; }}`
			`.badge.info {{ background-color: #3498db; }}`
			`.score {{ font-size: 24px; font-weight: bold; }}`
			`.score-container {{ text-align: right; }}`
			`</style>`
			`</head>`
			`<body>`
			`<div class="header">`
			`<h1>Compliance Report</h1>`
			`<p>Document: {report_data.get('document_name', 'Unknown')}</p>`
			`<p>Generated: {report_data.get('timestamp', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))}</p>`
			`<div class="score-container">`
			`<span>Compliance Score: </span>`
			`<span class="score">{report_data.get('compliance_score', 0) * 100:.1f}%</span>`
			`</div>`
			`</div>`

			`<div class="summary">`
			`<h2>Summary</h2>`
			`<p>{report_data.get('summary', 'No summary available.')}</p>`

			`{_render_applied_standards(report_data.get('applied_standards', []))}`
			`</div>`

			`<div class="issues">`
			`<h2>Compliance Issues</h2>`
			`"""`

			`# Add issues`
			`issues = report_data.get('issues', [])`
			`if not issues:`
			`html += "<p>No compliance issues found.</p>"`
			`else:`
			`for issue in issues:`
			`level = issue.get('level', 'info').lower()`
			`html += f"""`
			`<div class="issue {level}">`
			`<h3>{issue.get('section', 'Unknown Section')}</h3>`
			`<p><span class="badge {level}">{level.upper()}</span> {issue.get('description', 'No description')}</p>`
			`<p><strong>Recommendation:</strong> {issue.get('recommendation', 'No recommendation')}</p>`
			`</div>`
			`"""`

			`# Close HTML`
			`html += """`
			`</div>`
			`</body>`
			`</html>`
			`"""`

			`return html`