Files
ds_scp_task_solution/app/utils/helpers.py
T

283 lines
8.4 KiB
Python
Raw Normal View History

2025-07-17 22:20:25 +01:00
# Utility functions
import re
from typing import Dict, List, Any, Optional
import os
from datetime import datetime
import json
def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
    """
    Extract sections from a markdown document.

    The result always contains the key ``"full_document"`` holding the
    unmodified input. Each ``#``-style heading (one to six hashes followed by
    spaces/tabs and a title on the same line) adds an entry keyed
    ``"h<level>_<title>"`` whose value is the stripped text between that
    heading and the next heading of any level, or the end of the document.

    Args:
        markdown_text: The markdown text to parse

    Returns:
        Dictionary mapping section names to section content. When two
        headings share the same level and title, the later one overwrites
        the earlier one.
    """
    sections = {"full_document": markdown_text}

    # Only spaces/tabs may separate the hashes from the title. A generic
    # whitespace class would also consume a newline, so a bare "#" line would
    # swallow the following line as its heading text.
    heading_pattern = re.compile(r'^(#{1,6})[ \t]+(\S.*)$', re.MULTILINE)
    matches = list(heading_pattern.finditer(markdown_text))

    # Each section ends where the next heading starts; the last one runs to
    # the end of the document.
    section_ends = [m.start() for m in matches[1:]] + [len(markdown_text)]

    for match, end_pos in zip(matches, section_ends):
        heading_level = len(match.group(1))
        section_name = match.group(2).strip()
        section_content = markdown_text[match.end():end_pos].strip()
        sections[f"h{heading_level}_{section_name}"] = section_content

    return sections
def detect_file_type(filename: str) -> str:
    """
    Map a filename's extension to a coarse file type.

    The extension is compared case-insensitively and without its leading dot.

    Args:
        filename: Name of the file

    Returns:
        One of 'markdown', 'text', 'json', 'yaml', 'html', or 'unknown' when
        the extension is missing or not recognised.
    """
    # Lookup table: normalised extension -> file type
    known_types = {
        'md': 'markdown',
        'markdown': 'markdown',
        'txt': 'text',
        'text': 'text',
        'json': 'json',
        'yaml': 'yaml',
        'yml': 'yaml',
        'html': 'html',
        'htm': 'html',
    }
    suffix = os.path.splitext(filename)[1]
    normalised = suffix.lower().lstrip('.')
    return known_types.get(normalised, 'unknown')
def parse_code_blocks(content: str) -> List[Dict[str, str]]:
    """
    Extract fenced (triple-backtick) code blocks from markdown content.

    The language is the first whitespace-delimited token after the opening
    fence; anything else on that line is ignored. Language tags may contain
    non-word characters (e.g. ``c++``, ``objective-c``) and the opening line
    may end in either LF or CRLF.

    Args:
        content: Markdown content with code blocks

    Returns:
        List of dictionaries with keys 'language' (defaults to 'text' when
        the fence carries no tag) and 'code' (block body, stripped).
    """
    # Group 1: language tag (may be empty). The rest of the fence line is
    # skipped, which also absorbs a trailing carriage return.
    # Group 2: everything up to the next closing fence (non-greedy).
    fence_pattern = re.compile(r'```[ \t]*([^\s`]*)[^\n`]*\n([\s\S]*?)```')

    return [
        {
            'language': match.group(1).strip() or 'text',
            'code': match.group(2).strip(),
        }
        for match in fence_pattern.finditer(content)
    ]
def format_timestamp(timestamp: datetime) -> str:
    """
    Render a datetime as a human-readable string.

    Args:
        timestamp: Datetime object

    Returns:
        The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``
    """
    display_format = "%Y-%m-%d %H:%M:%S"
    return timestamp.strftime(display_format)
def calculate_readability_score(text: str) -> float:
    """
    Calculate a simple readability score for text.

    The score depends only on the average number of words per sentence.
    Sentences are the non-blank fragments obtained by splitting on runs of
    '.', '!' or '?'; words are runs of word characters.

    Args:
        text: The text to analyze

    Returns:
        Readability score (0.0-1.0): 0.0 for text without words, 0.7 for an
        average of at most 10 words per sentence, 1.0 for more than 10 up to
        20, 0.8 for more than 20 up to 30, and 0.5 above 30.
    """
    if not text:
        return 0.0

    # re.split leaves empty fragments (e.g. after the final period); counting
    # them as sentences would deflate the average, so drop blank pieces.
    sentences = [part for part in re.split(r'[.!?]+', text) if part.strip()]
    words = re.findall(r'\b\w+\b', text)
    if not words or not sentences:
        return 0.0

    avg_words_per_sentence = len(words) / len(sentences)

    # Optimal is around 15-20 words per sentence
    if avg_words_per_sentence <= 10:
        return 0.7  # Very short sentences
    if avg_words_per_sentence <= 20:
        return 1.0  # Optimal
    if avg_words_per_sentence <= 30:
        return 0.8  # Getting long
    return 0.5  # Too long
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to be safe for filesystem.

    Replaces each of ``< > : " / \\ | ? *`` and every ASCII control character
    (including NUL) with an underscore, then limits the result to 255
    characters, keeping the extension whenever it fits.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename, at most 255 characters long
    """
    max_length = 255

    # Replace illegal characters (reserved punctuation and control chars)
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)

    if len(sanitized) > max_length:
        base, ext = os.path.splitext(sanitized)
        if len(ext) < max_length:
            # Shorten the base name so the extension survives
            sanitized = base[:max_length - len(ext)] + ext
        else:
            # The "extension" alone exceeds the limit; a negative slice index
            # here would leave the name too long, so truncate outright.
            sanitized = sanitized[:max_length]

    return sanitized
def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load compliance standards from a JSON file.

    The file is read as UTF-8 regardless of the platform's default encoding.
    The parsed JSON value is returned as-is; its shape is not validated.

    Args:
        file_path: Path to the standards JSON file

    Returns:
        The parsed file content (expected to be a list of standard
        dictionaries), or an empty list if the file is missing, is not valid
        UTF-8, or does not contain valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # Best-effort loader: a missing or unreadable file means "no standards"
        return []
def _render_applied_standards(standards: List[str]) -> str:
"""
Render HTML for applied standards section.
Args:
standards: List of standard names
Returns:
HTML string for the applied standards section
"""
if not standards:
return ""
html = """<div style="margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
<h3 style="margin-top: 0; font-size: 16px; color: #495057;">Applied Standards</h3>
<ul style="margin: 5px 0 0 20px; padding: 0;">
"""
for standard in standards:
html += f"<li style=\"margin-bottom: 3px;\">{standard}</li>\n"
html += "</ul></div>"
return html
def generate_html_report(report_data: Dict[str, Any]) -> str:
    """
    Generate HTML for compliance report.

    Every value taken from ``report_data`` is HTML-escaped before it is placed
    in the page, so report content cannot inject markup. The applied-standards
    fragment is produced by ``_render_applied_standards`` and inserted as-is.

    Args:
        report_data: Report data dictionary. Keys read (all optional):
            ``document_name``, ``timestamp`` (defaults to the current local
            time), ``compliance_score`` (multiplied by 100 and shown as a
            percentage; missing or None counts as 0), ``summary``,
            ``applied_standards`` and ``issues`` (an iterable of dicts with
            ``level``, ``section``, ``description``, ``recommendation``).

    Returns:
        HTML string for the report
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def esc(value: Any) -> str:
        """Stringify and HTML-escape a single report value."""
        return escape(str(value))

    def render_issue(issue: Dict[str, Any]) -> str:
        """Render one issue card; the level doubles as the CSS class."""
        level = str(issue.get('level') or 'info').lower()
        return f"""
<div class="issue {esc(level)}">
<h3>{esc(issue.get('section', 'Unknown Section'))}</h3>
<p><span class="badge {esc(level)}">{esc(level.upper())}</span> {esc(issue.get('description', 'No description'))}</p>
<p><strong>Recommendation:</strong> {esc(issue.get('recommendation', 'No recommendation'))}</p>
</div>
"""

    # Static stylesheet, kept out of the f-string so braces need no doubling
    css = """<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }
.header { background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }
.summary { margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }
.issues { margin: 20px 0; }
.issue { margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }
.issue.critical { background-color: #fdedec; border-left-color: #c0392b; }
.issue.major { background-color: #fef9e7; border-left-color: #f1c40f; }
.issue.minor { background-color: #eafaf1; border-left-color: #2ecc71; }
.issue.info { background-color: #ebf5fb; border-left-color: #3498db; }
.issue h3 { margin-top: 0; }
.issue p { margin: 5px 0; }
.badge { display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }
.badge.critical { background-color: #c0392b; }
.badge.major { background-color: #f1c40f; color: #333; }
.badge.minor { background-color: #2ecc71; }
.badge.info { background-color: #3498db; }
.score { font-size: 24px; font-weight: bold; }
.score-container { text-align: right; }
</style>"""

    timestamp = report_data.get('timestamp')
    if timestamp is None:
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # A missing or None score is reported as 0.0% instead of raising
    score = report_data.get('compliance_score') or 0

    issues = report_data.get('issues') or []
    if issues:
        issues_html = "".join(render_issue(issue) for issue in issues)
    else:
        issues_html = "<p>No compliance issues found.</p>"

    return f"""
<!DOCTYPE html>
<html>
<head>
<title>Compliance Report</title>
{css}
</head>
<body>
<div class="header">
<h1>Compliance Report</h1>
<p>Document: {esc(report_data.get('document_name', 'Unknown'))}</p>
<p>Generated: {esc(timestamp)}</p>
<div class="score-container">
<span>Compliance Score: </span>
<span class="score">{score * 100:.1f}%</span>
</div>
</div>
<div class="summary">
<h2>Summary</h2>
<p>{esc(report_data.get('summary', 'No summary available.'))}</p>
{_render_applied_standards(report_data.get('applied_standards', []))}
</div>
<div class="issues">
<h2>Compliance Issues</h2>
{issues_html}
</div>
</body>
</html>
"""