Initial commit

2025-07-17 22:20:25 +01:00
commit 0e3e22e8cb
39 changed files with 13295 additions and 0 deletions
@@ -0,0 +1 @@
+"""Utility functions for the Mini SpecsComply Pro application."""
@@ -0,0 +1,283 @@
+# Utility functions
+import re
+from typing import Dict, List, Any, Optional
+import os
+from datetime import datetime
+import json
+
+def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
+    """
+    Extract sections from a markdown document.
+
+    The result always contains the untouched input under "full_document".
+    Every ATX heading ("#" to "######") found outside a fenced code block
+    adds one entry keyed "h<level>_<heading text>" whose value is the
+    stripped text between that heading and the next heading (or the end of
+    the document). Lines starting with "#" inside fenced code blocks (for
+    example shell or Python comments) are not treated as headings.
+
+    Note: headings sharing both level and text map to the same key, so a
+    later section replaces an earlier one.
+
+    Args:
+        markdown_text: The markdown text to parse
+
+    Returns:
+        Dictionary mapping section names to section content
+    """
+    sections = {"full_document": markdown_text}
+
+    # Locate fenced code blocks so that '#' lines inside them can be ignored.
+    fence_pattern = re.compile(r'^ {0,3}(`{3,}|~{3,})', re.MULTILINE)
+    fenced_spans = []
+    open_fence = None  # (fence character, fence length, start offset)
+    for fence in fence_pattern.finditer(markdown_text):
+        marker = fence.group(1)
+        if open_fence is None:
+            open_fence = (marker[0], len(marker), fence.start())
+        elif marker[0] == open_fence[0] and len(marker) >= open_fence[1]:
+            fenced_spans.append((open_fence[2], fence.end()))
+            open_fence = None
+    if open_fence is not None:
+        # An unterminated fence runs to the end of the document.
+        fenced_spans.append((open_fence[2], len(markdown_text)))
+
+    # [ \t]+ (rather than \s+) keeps a bare '#' line from swallowing the
+    # following line as its heading text.
+    heading_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
+    headings = [
+        match for match in heading_pattern.finditer(markdown_text)
+        if not any(start <= match.start() < end for start, end in fenced_spans)
+    ]
+
+    for index, match in enumerate(headings):
+        heading_level = len(match.group(1))
+        section_name = match.group(2).strip()
+
+        # Section content runs from this heading to the next, or to the end.
+        start_pos = match.end()
+        if index + 1 < len(headings):
+            end_pos = headings[index + 1].start()
+        else:
+            end_pos = len(markdown_text)
+
+        section_key = f"h{heading_level}_{section_name}"
+        sections[section_key] = markdown_text[start_pos:end_pos].strip()
+
+    return sections
+
+def detect_file_type(filename: str) -> str:
+    """
+    Map a filename's extension to a coarse file-type label.
+
+    The extension is compared case-insensitively and without its leading dot.
+
+    Args:
+        filename: Name of the file
+
+    Returns:
+        One of 'markdown', 'text', 'json', 'yaml', 'html', or 'unknown'
+        when the extension is missing or not recognised
+    """
+    # Extension (lower-case, no dot) -> file type label.
+    types_by_extension = {
+        'md': 'markdown',
+        'markdown': 'markdown',
+        'txt': 'text',
+        'text': 'text',
+        'json': 'json',
+        'yaml': 'yaml',
+        'yml': 'yaml',
+        'html': 'html',
+        'htm': 'html',
+    }
+
+    suffix = os.path.splitext(filename)[1]
+    normalized = suffix.lower().lstrip('.')
+    return types_by_extension.get(normalized, 'unknown')
+
+def parse_code_blocks(content: str) -> List[Dict[str, str]]:
+    """
+    Extract fenced (triple-backtick) code blocks from markdown content.
+
+    The text after the opening fence (the "info string") may be empty or
+    contain any characters except a backtick, so languages such as "c++" or
+    "objective-c" and files with Windows line endings are recognised. Only
+    the first word of the info string is reported as the language.
+
+    Args:
+        content: Markdown content with code blocks
+
+    Returns:
+        List of dictionaries with 'language' ('text' when none is given)
+        and 'code' (the block body with surrounding whitespace stripped)
+    """
+    # Info string: anything up to the end of the opening fence line.
+    # Body: non-greedy, up to the next triple backtick.
+    pattern = r'```([^\n`]*)\n([\s\S]*?)```'
+
+    code_blocks = []
+    for info_string, code in re.findall(pattern, content):
+        info_words = info_string.split()
+        code_blocks.append({
+            'language': info_words[0] if info_words else 'text',
+            'code': code.strip()
+        })
+
+    return code_blocks
+
+def format_timestamp(timestamp: datetime) -> str:
+    """
+    Render a datetime as "YYYY-MM-DD HH:MM:SS" for display.
+
+    Args:
+        timestamp: Datetime object
+
+    Returns:
+        Formatted timestamp string
+    """
+    display_format = "%Y-%m-%d %H:%M:%S"
+    formatted = timestamp.strftime(display_format)
+    return formatted
+
+def calculate_readability_score(text: str) -> float:
+    """
+    Calculate a simple readability score for text.
+
+    The score depends only on the average number of words per sentence.
+    Sentences are the non-blank fragments between runs of '.', '!' and '?';
+    empty fragments (such as the one after a final period) are not counted,
+    so terminal punctuation does not skew the average.
+
+    Args:
+        text: The text to analyze
+
+    Returns:
+        Readability score: 0.0 for text without words, otherwise 0.7
+        (average of at most 10 words), 1.0 (over 10, up to 20), 0.8
+        (over 20, up to 30) or 0.5 (over 30)
+    """
+    if not text:
+        return 0.0
+
+    # Split into sentences and words, dropping empty sentence fragments.
+    sentences = [
+        fragment for fragment in re.split(r'[.!?]+', text) if fragment.strip()
+    ]
+    words = re.findall(r'\b\w+\b', text)
+
+    if not words or not sentences:
+        return 0.0
+
+    avg_words_per_sentence = len(words) / len(sentences)
+
+    # Simple bands: 10-20 words per sentence is treated as optimal.
+    if avg_words_per_sentence <= 10:
+        return 0.7  # Very short sentences
+    if avg_words_per_sentence <= 20:
+        return 1.0  # Optimal
+    if avg_words_per_sentence <= 30:
+        return 0.8  # Getting long
+    return 0.5  # Too long
+
+def sanitize_filename(filename: str) -> str:
+    """
+    Sanitize filename to be safe for filesystem.
+
+    Characters that are illegal in file names on common filesystems
+    (< > : " / \\ | ? *) and ASCII control characters are each replaced by an
+    underscore. The result is then limited to 255 characters, keeping the
+    extension whenever the extension itself is shorter than that limit.
+
+    Args:
+        filename: Original filename
+
+    Returns:
+        Sanitized filename, never longer than 255 characters
+    """
+    max_length = 255
+
+    # Replace illegal and control characters.
+    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
+
+    if len(sanitized) > max_length:
+        base, ext = os.path.splitext(sanitized)
+        if len(ext) >= max_length:
+            # The extension alone would overflow the limit; a negative slice
+            # on the base would not shorten anything, so cut the whole name.
+            sanitized = sanitized[:max_length]
+        else:
+            sanitized = base[:max_length - len(ext)] + ext
+
+    return sanitized
+
+def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
+    """
+    Load compliance standards from a JSON file.
+
+    The file is read as UTF-8 regardless of the platform's default encoding.
+    Loading is best-effort: a missing file, bytes that are not valid UTF-8,
+    or malformed JSON all yield an empty list instead of raising. Other I/O
+    errors (for example permission problems) still propagate.
+
+    The parsed JSON value is returned as-is; it is not validated to be a
+    list of dictionaries.
+
+    Args:
+        file_path: Path to the standards JSON file
+
+    Returns:
+        List of standard dictionaries, or an empty list if the file is
+        missing or invalid
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except (FileNotFoundError, UnicodeDecodeError, json.JSONDecodeError):
+        # Return empty list if file not found or invalid
+        return []
+
+def _render_applied_standards(standards: List[str]) -> str:
+    """
+    Render HTML for applied standards section.
+
+    Each standard name is converted to text and HTML-escaped before being
+    placed in a list item, so names containing markup characters cannot
+    inject HTML into the report.
+
+    Args:
+        standards: List of standard names
+
+    Returns:
+        HTML string for the applied standards section, or an empty string
+        when there are no standards
+    """
+    # Imported here so the block does not depend on a module-level import.
+    from html import escape
+
+    if not standards:
+        return ""
+
+    opening = """<div style="margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
+        <h3 style="margin-top: 0; font-size: 16px; color: #495057;">Applied Standards</h3>
+        <ul style="margin: 5px 0 0 20px; padding: 0;">
+    """
+
+    items = "".join(
+        f"<li style=\"margin-bottom: 3px;\">{escape(str(standard))}</li>\n"
+        for standard in standards
+    )
+
+    return opening + items + "</ul></div>"
+
+def generate_html_report(report_data: Dict[str, Any]) -> str:
+    """
+    Generate HTML for compliance report.
+
+    Reads these keys from report_data, all optional: 'document_name',
+    'timestamp' (defaults to the current local time), 'compliance_score'
+    (a fraction, shown as a percentage), 'summary', 'applied_standards'
+    and 'issues' (a list of dicts with 'level', 'section', 'description'
+    and 'recommendation').
+
+    Every value taken from report_data is HTML-escaped before it is placed
+    in the page, so document names, summaries or issue texts containing
+    markup characters cannot inject HTML. Missing or None values fall back
+    to the same defaults as absent keys.
+
+    Args:
+        report_data: Report data dictionary
+
+    Returns:
+        HTML string for the report
+    """
+    # Imported here so the block does not depend on a module-level import.
+    from html import escape
+
+    def _text(value: Any) -> str:
+        """Return value converted to HTML-escaped text."""
+        return escape(str(value))
+
+    document_name = _text(report_data.get('document_name', 'Unknown'))
+    summary = _text(report_data.get('summary', 'No summary available.'))
+
+    timestamp = report_data.get('timestamp')
+    if timestamp is None:
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    timestamp = _text(timestamp)
+
+    # A missing or None score is reported as 0.0%.
+    score_percent = (report_data.get('compliance_score') or 0) * 100
+
+    # The helper returns ready-made markup, so it is inserted unescaped.
+    standards_html = _render_applied_standards(report_data.get('applied_standards', []))
+
+    # Simple HTML template for the report
+    page_head = f"""
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>Compliance Report</title>
+        <style>
+            body {{ font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }}
+            .header {{ background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }}
+            .summary {{ margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }}
+            .issues {{ margin: 20px 0; }}
+            .issue {{ margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }}
+            .issue.critical {{ background-color: #fdedec; border-left-color: #c0392b; }}
+            .issue.major {{ background-color: #fef9e7; border-left-color: #f1c40f; }}
+            .issue.minor {{ background-color: #eafaf1; border-left-color: #2ecc71; }}
+            .issue.info {{ background-color: #ebf5fb; border-left-color: #3498db; }}
+            .issue h3 {{ margin-top: 0; }}
+            .issue p {{ margin: 5px 0; }}
+            .badge {{ display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }}
+            .badge.critical {{ background-color: #c0392b; }}
+            .badge.major {{ background-color: #f1c40f; color: #333; }}
+            .badge.minor {{ background-color: #2ecc71; }}
+            .badge.info {{ background-color: #3498db; }}
+            .score {{ font-size: 24px; font-weight: bold; }}
+            .score-container {{ text-align: right; }}
+        </style>
+    </head>
+    <body>
+        <div class="header">
+            <h1>Compliance Report</h1>
+            <p>Document: {document_name}</p>
+            <p>Generated: {timestamp}</p>
+            <div class="score-container">
+                <span>Compliance Score: </span>
+                <span class="score">{score_percent:.1f}%</span>
+            </div>
+        </div>
+
+        <div class="summary">
+            <h2>Summary</h2>
+            <p>{summary}</p>
+
+            {standards_html}
+        </div>
+
+        <div class="issues">
+            <h2>Compliance Issues</h2>
+    """
+
+    parts = [page_head]
+
+    # Add issues
+    issues = report_data.get('issues') or []
+    if not issues:
+        parts.append("<p>No compliance issues found.</p>")
+    else:
+        for issue in issues:
+            raw_level = str(issue.get('level') or 'info').lower()
+            # Escape the class name and the label separately: upper-casing
+            # an already escaped string would alter its character entities.
+            level_class = escape(raw_level)
+            level_label = escape(raw_level.upper())
+            section = _text(issue.get('section', 'Unknown Section'))
+            description = _text(issue.get('description', 'No description'))
+            recommendation = _text(issue.get('recommendation', 'No recommendation'))
+            parts.append(f"""
+            <div class="issue {level_class}">
+                <h3>{section}</h3>
+                <p><span class="badge {level_class}">{level_label}</span> {description}</p>
+                <p><strong>Recommendation:</strong> {recommendation}</p>
+            </div>
+            """)
+
+    # Close HTML
+    parts.append("""
+        </div>
+    </body>
+    </html>
+    """)
+
+    return "".join(parts)
@@ -0,0 +1,80 @@
+"""
+Token counting utilities for document processing.
+"""
+import tiktoken
+from typing import Dict, List, Optional, Union
+from loguru import logger
+
+# Default model to use for token counting
+DEFAULT_MODEL = "gpt-4o"
+
+def count_tokens(text: str, model: str = DEFAULT_MODEL) -> int:
+    """
+    Count the number of tokens in a text string using tiktoken.
+
+    Three strategies are tried in order: the encoding tiktoken associates
+    with the given model, the "cl100k_base" encoding, and finally a rough
+    estimate of four characters per token. Failures are logged, not raised.
+
+    Special-token markers that happen to appear in the text (for example
+    "<|endoftext|>") are encoded as ordinary text. By default tiktoken
+    raises on such input, which would otherwise push every document that
+    mentions one of these markers down to the rough estimate.
+
+    Args:
+        text: The text to count tokens for
+        model: The model to use for token counting (default: gpt-4o)
+
+    Returns:
+        Number of tokens in the text
+    """
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception as model_error:
+        logger.warning(f"Error counting tokens with model {model}: {str(model_error)}")
+        # Fallback to cl100k_base encoding if model-specific encoding fails
+        try:
+            encoding = tiktoken.get_encoding("cl100k_base")
+            return len(encoding.encode(text, disallowed_special=()))
+        except Exception as fallback_error:
+            logger.error(f"Error counting tokens with fallback encoding: {str(fallback_error)}")
+            # If all else fails, use a rough approximation (4 chars per token)
+            return len(text) // 4
+
+def truncate_by_tokens(text: str, max_tokens: int, model: str = DEFAULT_MODEL) -> str:
+    """
+    Truncate text to at most max_tokens tokens of its original content.
+
+    Text that already fits is returned unchanged. Otherwise the first
+    max_tokens tokens are decoded and the marker "...(truncated)" is
+    appended; note that the marker itself is added on top of the limit.
+    If tiktoken cannot be used, the text is cut by characters instead,
+    assuming roughly four characters per token.
+
+    A negative max_tokens is treated as 0 rather than being interpreted
+    as a slice from the end of the text. Special-token markers appearing in
+    the text (for example "<|endoftext|>") are encoded as ordinary text
+    instead of making tiktoken raise.
+
+    Args:
+        text: The text to truncate
+        max_tokens: Maximum number of tokens to allow
+        model: The model to use for token counting (default: gpt-4o)
+
+    Returns:
+        The original text, or its truncated prefix followed by
+        "...(truncated)"
+    """
+    # Guard against negative limits, which would slice from the end.
+    max_tokens = max(max_tokens, 0)
+
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+        tokens = encoding.encode(text, disallowed_special=())
+
+        if len(tokens) <= max_tokens:
+            return text
+
+        # Truncate tokens and decode
+        truncated_text = encoding.decode(tokens[:max_tokens])
+
+        # Add truncation indicator
+        return truncated_text + "...(truncated)"
+    except Exception as e:
+        logger.warning(f"Error truncating by tokens with model {model}: {str(e)}")
+        # Fallback to character-based truncation if token-based fails
+        approx_chars = max_tokens * 4  # Rough approximation
+        if len(text) <= approx_chars:
+            return text
+        return text[:approx_chars] + "...(truncated)"
+
+def estimate_tokens_from_chars(char_count: int) -> int:
+    """
+    Give a rough token estimate for a given number of characters.
+
+    Uses the rule of thumb of four characters per token and rounds down
+    (floor division).
+
+    Args:
+        char_count: Number of characters
+
+    Returns:
+        Estimated number of tokens
+    """
+    chars_per_token = 4
+    estimated_tokens = char_count // chars_per_token
+    return estimated_tokens
				`@@ -0,0 +1 @@`
				`"""Utility functions for the Mini SpecsComply Pro application."""`