283 lines
8.4 KiB
Python
283 lines
8.4 KiB
Python
|
|
# Utility functions
|
||
|
|
import re
|
||
|
|
from typing import Dict, List, Any, Optional
|
||
|
|
import os
|
||
|
|
from datetime import datetime
|
||
|
|
import json
|
||
|
|
|
||
|
|
def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
    """
    Extract sections from a markdown document.

    The document is split at ATX headings (``#`` to ``######`` followed by
    a space or tab and a title). Lines that look like headings but sit
    inside a fenced code block (``` or ~~~) are ignored, so shell or Python
    comments in code samples do not create bogus sections.

    Args:
        markdown_text: The markdown text to parse

    Returns:
        Dictionary mapping section names to section content. The key
        "full_document" always holds the unmodified input. Every heading
        adds a key of the form "h<level>_<title>" whose value is the
        stripped text between that heading and the next heading (of any
        level) or the end of the document. Headings that share the same
        level and title map to the same key; the last one wins.
    """
    # The whole document is always available as one section.
    sections = {"full_document": markdown_text}

    # [ \t]+ instead of \s+: \s also matches newlines, which allowed a bare
    # "#" line to swallow the following paragraph as its title.
    heading_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
    fence_pattern = re.compile(r'^[ \t]*(`{3,}|~{3,})(.*)$', re.MULTILINE)

    # Collect the character spans covered by fenced code blocks.
    fenced_spans = []
    open_marker = None
    open_start = 0
    for fence in fence_pattern.finditer(markdown_text):
        marker, trailing = fence.group(1), fence.group(2)
        if open_marker is None:
            if marker[0] == '`' and '`' in trailing:
                # Backticks after the marker mean an inline code span,
                # not an opening fence.
                continue
            open_marker, open_start = marker, fence.start()
        elif (
            marker[0] == open_marker[0]
            and len(marker) >= len(open_marker)
            and not trailing.strip()
        ):
            # A closing fence uses the same character, is at least as long
            # as the opener and carries no info string.
            fenced_spans.append((open_start, fence.end()))
            open_marker = None
    if open_marker is not None:
        # An unterminated fence runs to the end of the document.
        fenced_spans.append((open_start, len(markdown_text)))

    def inside_fence(position: int) -> bool:
        return any(start <= position < end for start, end in fenced_spans)

    headings = [
        match
        for match in heading_pattern.finditer(markdown_text)
        if not inside_fence(match.start())
    ]

    for index, match in enumerate(headings):
        heading_level = len(match.group(1))
        section_name = match.group(2).strip()

        # Section content runs from this heading to the next, or to the end.
        start_pos = match.end()
        if index + 1 < len(headings):
            end_pos = headings[index + 1].start()
        else:
            end_pos = len(markdown_text)

        section_key = f"h{heading_level}_{section_name}"
        sections[section_key] = markdown_text[start_pos:end_pos].strip()

    return sections
|
||
|
|
|
||
|
|
def detect_file_type(filename: str) -> str:
    """
    Classify a file by the extension of its name.

    Args:
        filename: Name (or path) of the file to classify

    Returns:
        One of 'markdown', 'text', 'json', 'yaml' or 'html'; 'unknown'
        when the extension is missing or not recognised. Matching is
        case-insensitive.
    """
    # Extension (without the dot, lower-cased) -> file type label.
    type_by_extension = {
        'md': 'markdown',
        'markdown': 'markdown',
        'txt': 'text',
        'text': 'text',
        'json': 'json',
        'yaml': 'yaml',
        'yml': 'yaml',
        'html': 'html',
        'htm': 'html',
    }

    suffix = os.path.splitext(filename)[1]
    normalized = suffix.lower().lstrip('.')
    return type_by_extension.get(normalized, 'unknown')
|
||
|
|
|
||
|
|
def parse_code_blocks(content: str) -> List[Dict[str, str]]:
    """
    Extract fenced code blocks from markdown content.

    A block starts with three backticks, optionally followed by an info
    string on the same line, and ends at the next three backticks.

    Args:
        content: Markdown content with code blocks

    Returns:
        List of dictionaries, in document order, each with:
        'language' - the first word of the info string, or 'text' when
        the fence has no info string; 'code' - the block body with
        surrounding whitespace stripped
    """
    # The info string may contain anything except a newline or backtick.
    # Restricting it to \w* rejected openers such as "```c++" or a CRLF
    # line ending; the scan then paired the closing fence with the next
    # opener and returned the prose in between as code.
    pattern = r'```([^\n`]*)\n([\s\S]*?)```'

    code_blocks = []
    for info, code in re.findall(pattern, content):
        info_words = info.split()
        code_blocks.append({
            'language': info_words[0] if info_words else 'text',
            'code': code.strip(),
        })

    return code_blocks
|
||
|
|
|
||
|
|
def format_timestamp(timestamp: datetime) -> str:
    """
    Render a datetime as a display string.

    Args:
        timestamp: Datetime object to render

    Returns:
        The timestamp formatted as "YYYY-MM-DD HH:MM:SS"
    """
    display_format = "%Y-%m-%d %H:%M:%S"
    return timestamp.strftime(display_format)
|
||
|
|
|
||
|
|
def calculate_readability_score(text: str) -> float:
    """
    Calculate a simple readability score for text.

    The score depends only on the average number of words per sentence.
    Sentences are delimited by runs of '.', '!' or '?'.

    Args:
        text: The text to analyze

    Returns:
        Readability score (0.0-1.0):
        0.0 for empty text or text without any words,
        0.7 for an average of at most 10 words per sentence,
        1.0 for more than 10 and at most 20,
        0.8 for more than 20 and at most 30,
        0.5 for anything longer
    """
    if not text:
        return 0.0

    words = re.findall(r'\b\w+\b', text)

    # re.split leaves empty or punctuation-only fragments (for example the
    # empty string after a final period). Counting those as sentences
    # understated the average, so only fragments with a word are kept.
    sentences = [
        fragment
        for fragment in re.split(r'[.!?]+', text)
        if re.search(r'\w', fragment)
    ]

    if not words or not sentences:
        return 0.0

    avg_words_per_sentence = len(words) / len(sentences)

    if avg_words_per_sentence <= 10:
        return 0.7  # Very short sentences
    if avg_words_per_sentence <= 20:
        return 1.0  # Optimal
    if avg_words_per_sentence <= 30:
        return 0.8  # Getting long
    return 0.5  # Too long
|
||
|
|
|
||
|
|
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to be safe for filesystem.

    Each of the characters < > : " / \\ | ? * is replaced by an underscore
    and the result is limited to 255 characters. When truncation is
    needed the extension is kept and the base name is shortened.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename, at most 255 characters long
    """
    max_length = 255

    # Replace illegal characters
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Ensure it's not too long
    if len(sanitized) > max_length:
        base, ext = os.path.splitext(sanitized)
        if len(ext) > max_length:
            # The extension alone exceeds the limit; slicing the base with
            # a negative bound would leave the result too long, so cut the
            # whole name instead.
            sanitized = sanitized[:max_length]
        else:
            sanitized = base[:max_length - len(ext)] + ext

    return sanitized
|
||
|
|
|
||
|
|
def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load compliance standards from a JSON file.

    The file is read as UTF-8 regardless of the platform's default
    encoding.

    Args:
        file_path: Path to the standards JSON file

    Returns:
        The parsed JSON content, returned as-is and expected to be a list
        of standard dictionaries. An empty list is returned when the file
        does not exist, is not valid UTF-8, or is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, UnicodeDecodeError, json.JSONDecodeError):
        # Best effort: a missing or unreadable standards file means
        # "no standards", not an error.
        return []
|
||
|
|
|
||
|
|
def _escape_html(value: Any) -> str:
    """Return ``value`` as text that is safe inside HTML content and attributes."""
    # Imported here because the module does not import ``html`` at the top.
    from html import escape

    return escape(str(value), quote=True)


def _render_applied_standards(standards: List[str]) -> str:
    """
    Render HTML for applied standards section.

    Args:
        standards: List of standard names

    Returns:
        HTML string for the applied standards section: a styled box with
        one list item per standard, each name HTML-escaped. An empty
        string is returned when there are no standards.
    """
    if not standards:
        return ""

    items = "".join(
        f"<li style=\"margin-bottom: 3px;\">{_escape_html(standard)}</li>\n"
        for standard in standards
    )

    return (
        '<div style="margin-top: 15px; padding: 10px; '
        'background-color: #f8f9fa; border-radius: 5px;">\n'
        '    <h3 style="margin-top: 0; font-size: 16px; color: #495057;">'
        'Applied Standards</h3>\n'
        '    <ul style="margin: 5px 0 0 20px; padding: 0;">\n'
        f"{items}"
        "</ul></div>"
    )


def generate_html_report(report_data: Dict[str, Any]) -> str:
    """
    Generate HTML for compliance report.

    Every value taken from ``report_data`` is HTML-escaped before it is
    placed in the page, so names, summaries or issue texts containing
    ``<``, ``>``, ``&`` or quotes cannot break or inject markup.

    Args:
        report_data: Report data dictionary. Keys read, all optional:
            'document_name' (default 'Unknown'),
            'timestamp' (default: the current local time as
            "YYYY-MM-DD HH:MM:SS"),
            'compliance_score' (a number multiplied by 100 and shown with
            one decimal; default 0),
            'summary' (default 'No summary available.'),
            'applied_standards' (list of names),
            'issues' (list of dicts with optional 'level', 'section',
            'description' and 'recommendation').

    Returns:
        HTML string for the report (a complete document)
    """
    document_name = _escape_html(report_data.get('document_name', 'Unknown'))
    generated_at = _escape_html(
        report_data.get('timestamp')
        or datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    )
    # A missing or None score is reported as 0 instead of raising.
    score = report_data.get('compliance_score') or 0
    summary = _escape_html(report_data.get('summary', 'No summary available.'))
    applied_standards = _render_applied_standards(
        report_data.get('applied_standards') or []
    )

    # Static page head; a plain string so the CSS braces need no doubling.
    page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Compliance Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }
        .header { background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }
        .summary { margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }
        .issues { margin: 20px 0; }
        .issue { margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }
        .issue.critical { background-color: #fdedec; border-left-color: #c0392b; }
        .issue.major { background-color: #fef9e7; border-left-color: #f1c40f; }
        .issue.minor { background-color: #eafaf1; border-left-color: #2ecc71; }
        .issue.info { background-color: #ebf5fb; border-left-color: #3498db; }
        .issue h3 { margin-top: 0; }
        .issue p { margin: 5px 0; }
        .badge { display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }
        .badge.critical { background-color: #c0392b; }
        .badge.major { background-color: #f1c40f; color: #333; }
        .badge.minor { background-color: #2ecc71; }
        .badge.info { background-color: #3498db; }
        .score { font-size: 24px; font-weight: bold; }
        .score-container { text-align: right; }
    </style>
</head>
<body>
"""

    page_intro = f"""    <div class="header">
        <h1>Compliance Report</h1>
        <p>Document: {document_name}</p>
        <p>Generated: {generated_at}</p>
        <div class="score-container">
            <span>Compliance Score: </span>
            <span class="score">{score * 100:.1f}%</span>
        </div>
    </div>

    <div class="summary">
        <h2>Summary</h2>
        <p>{summary}</p>

        {applied_standards}
    </div>

    <div class="issues">
        <h2>Compliance Issues</h2>
"""

    parts = [page_head, page_intro]

    # Add issues
    issues = report_data.get('issues') or []
    if not issues:
        parts.append("<p>No compliance issues found.</p>")
    else:
        for issue in issues:
            # A missing or None level falls back to 'info'. The class name
            # and the badge label are escaped separately so upper-casing
            # never touches an escape sequence.
            raw_level = str(issue.get('level') or 'info').lower()
            level_class = _escape_html(raw_level)
            level_label = _escape_html(raw_level.upper())
            section = _escape_html(issue.get('section', 'Unknown Section'))
            description = _escape_html(issue.get('description', 'No description'))
            recommendation = _escape_html(
                issue.get('recommendation', 'No recommendation')
            )
            parts.append(f"""
        <div class="issue {level_class}">
            <h3>{section}</h3>
            <p><span class="badge {level_class}">{level_label}</span> {description}</p>
            <p><strong>Recommendation:</strong> {recommendation}</p>
        </div>
""")

    # Close HTML
    parts.append("""
    </div>
</body>
</html>
""")

    return "".join(parts)
|