Initial commit
This commit is contained in:
@@ -0,0 +1 @@
|
||||
"""Utility functions for the Mini SpecsComply Pro application."""
|
||||
@@ -0,0 +1,283 @@
|
||||
# Utility functions
|
||||
import re
|
||||
from typing import Dict, List, Any, Optional
|
||||
import os
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
def extract_sections_from_markdown(markdown_text: str) -> Dict[str, str]:
    """
    Extract sections from a markdown document.

    The document is split at ATX headings (``#`` to ``######``). A section
    runs from the end of its heading line to the start of the next heading
    of any level (or the end of the document), so a parent section does not
    contain the text of its sub-sections.

    Heading-like lines inside fenced code blocks (``` or ~~~) are ignored,
    e.g. ``# comment`` lines in a Python or shell sample.

    Args:
        markdown_text: The markdown text to parse

    Returns:
        Dictionary mapping section names to section content. The key
        ``"full_document"`` always holds the unmodified input. Each heading
        adds a key ``"h<level>_<heading text>"`` whose value is the stripped
        section body. If a key occurs more than once, later occurrences get
        a numeric suffix (``_2``, ``_3``, ...) instead of overwriting the
        earlier section.
    """
    sections: Dict[str, str] = {"full_document": markdown_text}

    # Only spaces/tabs may separate the hashes from the title; ``\s+`` would
    # also match a newline and turn the *next* line into the title of a bare "#".
    heading_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
    fence_pattern = re.compile(
        r'^ {0,3}(`{3,}|~{3,})[^\n]*\n'    # opening fence line
        r'.*?'                              # fenced body (lazy)
        r'(?:^ {0,3}\1[`~]*[ \t]*$|\Z)',    # closing fence, or an unclosed fence
        re.MULTILINE | re.DOTALL,
    )

    fenced_spans = [m.span() for m in fence_pattern.finditer(markdown_text)]

    def _inside_fence(position: int) -> bool:
        return any(start <= position < end for start, end in fenced_spans)

    headings = [
        m for m in heading_pattern.finditer(markdown_text)
        if not _inside_fence(m.start())
    ]

    for index, heading in enumerate(headings):
        level = len(heading.group(1))
        title = heading.group(2).strip()

        # Section body: from the end of this heading line up to the next
        # real heading, or to the end of the document for the last one.
        body_start = heading.end()
        if index + 1 < len(headings):
            body_end = headings[index + 1].start()
        else:
            body_end = len(markdown_text)

        key = f"h{level}_{title}"
        if key in sections:
            # Keep the earlier section; give this one the first free suffix.
            suffix = 2
            while f"{key}_{suffix}" in sections:
                suffix += 1
            key = f"{key}_{suffix}"

        sections[key] = markdown_text[body_start:body_end].strip()

    return sections
|
||||
|
||||
def detect_file_type(filename: str) -> str:
    """
    Map a filename to a coarse file type based on its extension.

    The extension is compared case-insensitively. Names without a
    recognised extension yield ``'unknown'``.

    Args:
        filename: Name of the file

    Returns:
        One of 'markdown', 'text', 'json', 'yaml', 'html' or 'unknown'
    """
    type_by_extension = {
        'md': 'markdown',
        'markdown': 'markdown',
        'txt': 'text',
        'text': 'text',
        'json': 'json',
        'yaml': 'yaml',
        'yml': 'yaml',
        'html': 'html',
        'htm': 'html',
    }

    suffix = os.path.splitext(filename)[1]
    normalized = suffix.lower().lstrip('.')
    return type_by_extension.get(normalized, 'unknown')
|
||||
|
||||
def parse_code_blocks(content: str) -> List[Dict[str, str]]:
    """
    Extract fenced (backtick) code blocks from markdown content.

    A block starts on a line consisting of optional indentation, three or
    more backticks and an optional info string, and ends at the next line
    that holds a fence of at least the same length. The language is the
    first word of the info string, so tags such as ``c++`` or
    ``python title="x"`` are recognised. Anchoring both fences to line
    starts stops the closing fence of one block from being mistaken for the
    opening fence of the next.

    Args:
        content: Markdown content with code blocks

    Returns:
        List of dictionaries with keys ``'language'`` (``'text'`` when the
        fence has no info string) and ``'code'`` (the stripped block body),
        in document order. Unclosed fences are ignored.
    """
    fence_pattern = re.compile(
        r'^[ \t]*(`{3,})([^\n`]*)\n'     # opening fence + optional info string
        r'(.*?)'                          # block body (lazy, may be empty)
        r'^[ \t]*\1`*[ \t]*\r?$',         # closing fence alone on its line
        re.MULTILINE | re.DOTALL,
    )

    code_blocks = []
    for block in fence_pattern.finditer(content):
        info_words = block.group(2).split()
        code_blocks.append({
            'language': info_words[0] if info_words else 'text',
            'code': block.group(3).strip(),
        })

    return code_blocks
|
||||
|
||||
def format_timestamp(timestamp: datetime) -> str:
    """
    Render a datetime as ``YYYY-MM-DD HH:MM:SS`` for display.

    Args:
        timestamp: Datetime object

    Returns:
        Formatted timestamp string
    """
    display_format = "%Y-%m-%d %H:%M:%S"
    return format(timestamp, display_format)
|
||||
|
||||
def calculate_readability_score(text: str) -> float:
    """
    Calculate a simple readability score for text.

    The score depends only on the average number of words per sentence.
    Sentences are the fragments between runs of ``.``, ``!`` or ``?`` that
    contain at least one word character; empty fragments (such as the one
    after a final full stop) are not counted as sentences.

    Args:
        text: The text to analyze

    Returns:
        Readability score (0.0-1.0): 0.0 for text without words, 0.7 for an
        average of up to 10 words per sentence, 1.0 for up to 20, 0.8 for up
        to 30, and 0.5 beyond that.
    """
    if not text:
        return 0.0

    words = re.findall(r'\b\w+\b', text)
    if not words:
        return 0.0

    # re.split leaves an empty fragment after trailing punctuation; counting
    # it would halve the average for a single sentence ending in ".".
    sentences = [
        fragment
        for fragment in re.split(r'[.!?]+', text)
        if re.search(r'\w', fragment)
    ]

    # At least one fragment contains a word here, so the division is safe.
    avg_words_per_sentence = len(words) / len(sentences)

    if avg_words_per_sentence <= 10:
        return 0.7  # Very short sentences
    if avg_words_per_sentence <= 20:
        return 1.0  # Optimal
    if avg_words_per_sentence <= 30:
        return 0.8  # Getting long
    return 0.5  # Too long
|
||||
|
||||
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to be safe for filesystem.

    Characters that are illegal in file names on common filesystems
    (``< > : " / \\ | ? *``) and ASCII control characters (including NUL)
    are each replaced by an underscore. The result is then limited to 255
    characters, keeping the extension when it fits.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename, at most 255 characters long
    """
    max_length = 255

    # Replace illegal characters, including control characters (\x00-\x1f)
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)

    if len(sanitized) > max_length:
        base, ext = os.path.splitext(sanitized)
        if len(ext) < max_length:
            # Shorten the base name so the extension survives intact
            sanitized = base[:max_length - len(ext)] + ext
        else:
            # The "extension" alone exceeds the limit; a negative slice bound
            # would cut from the wrong end, so hard-truncate the whole name.
            sanitized = sanitized[:max_length]

    return sanitized
|
||||
|
||||
def load_standards_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load compliance standards from a JSON file.

    The file is read as UTF-8 (a leading byte-order mark is accepted), so
    the result does not depend on the platform's default encoding.

    Args:
        file_path: Path to the standards JSON file

    Returns:
        The parsed JSON content (expected to be a list of standard
        dictionaries), or an empty list if the file does not exist, is not
        valid UTF-8, or is not valid JSON.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 too and strips a BOM if present;
        # json.load would otherwise reject the BOM as invalid JSON.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # Best effort: a missing or unreadable standards file means "no standards"
        return []
|
||||
|
||||
def _render_applied_standards(standards: List[str]) -> str:
|
||||
"""
|
||||
Render HTML for applied standards section.
|
||||
|
||||
Args:
|
||||
standards: List of standard names
|
||||
|
||||
Returns:
|
||||
HTML string for the applied standards section
|
||||
"""
|
||||
if not standards:
|
||||
return ""
|
||||
|
||||
html = """<div style="margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
|
||||
<h3 style="margin-top: 0; font-size: 16px; color: #495057;">Applied Standards</h3>
|
||||
<ul style="margin: 5px 0 0 20px; padding: 0;">
|
||||
"""
|
||||
|
||||
for standard in standards:
|
||||
html += f"<li style=\"margin-bottom: 3px;\">{standard}</li>\n"
|
||||
|
||||
html += "</ul></div>"
|
||||
return html
|
||||
|
||||
def generate_html_report(report_data: Dict[str, Any]) -> str:
    """
    Generate HTML for compliance report.

    All values taken from ``report_data`` are HTML-escaped before being
    inserted into the page, so document names, summaries or issue texts
    containing markup are displayed literally.

    Args:
        report_data: Report data dictionary. Keys read here:
            ``document_name``, ``timestamp`` (defaults to the current local
            time), ``compliance_score`` (multiplied by 100 and shown as a
            percentage; missing/None counts as 0), ``summary``,
            ``applied_standards`` and ``issues`` (a list of dictionaries
            with ``level``, ``section``, ``description`` and
            ``recommendation``).

    Returns:
        HTML string for the report
    """
    # Imported locally: ``html`` is used as a local variable name in this module.
    from html import escape

    def _esc(value: Any) -> str:
        """Return value as HTML-safe text (also safe inside quoted attributes)."""
        return escape(str(value), quote=True)

    document_name = _esc(report_data.get('document_name', 'Unknown'))

    generated = report_data.get('timestamp')
    if generated is None:
        generated = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    generated = _esc(generated)

    # A missing or None score is reported as 0.0% instead of raising TypeError
    score = report_data.get('compliance_score') or 0
    summary = _esc(report_data.get('summary', 'No summary available.'))
    standards_html = _render_applied_standards(report_data.get('applied_standards') or [])

    # Plain (non-f) string so the CSS braces need no escaping
    style_sheet = """
        body { font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }
        .header { background-color: #f5f5f5; padding: 15px; border-bottom: 1px solid #ddd; }
        .summary { margin: 20px 0; padding: 15px; background-color: #e9f7ef; border-left: 4px solid #27ae60; }
        .issues { margin: 20px 0; }
        .issue { margin-bottom: 15px; padding: 15px; background-color: #f9f9f9; border-left: 4px solid #3498db; }
        .issue.critical { background-color: #fdedec; border-left-color: #c0392b; }
        .issue.major { background-color: #fef9e7; border-left-color: #f1c40f; }
        .issue.minor { background-color: #eafaf1; border-left-color: #2ecc71; }
        .issue.info { background-color: #ebf5fb; border-left-color: #3498db; }
        .issue h3 { margin-top: 0; }
        .issue p { margin: 5px 0; }
        .badge { display: inline-block; padding: 3px 7px; border-radius: 3px; font-size: 12px; color: white; }
        .badge.critical { background-color: #c0392b; }
        .badge.major { background-color: #f1c40f; color: #333; }
        .badge.minor { background-color: #2ecc71; }
        .badge.info { background-color: #3498db; }
        .score { font-size: 24px; font-weight: bold; }
        .score-container { text-align: right; }
    """

    parts = [
        "\n<!DOCTYPE html>\n<html>\n<head>\n"
        "    <title>Compliance Report</title>\n"
        "    <style>",
        style_sheet,
        "    </style>\n</head>\n<body>\n",
        f"""    <div class="header">
        <h1>Compliance Report</h1>
        <p>Document: {document_name}</p>
        <p>Generated: {generated}</p>
        <div class="score-container">
            <span>Compliance Score: </span>
            <span class="score">{score * 100:.1f}%</span>
        </div>
    </div>

    <div class="summary">
        <h2>Summary</h2>
        <p>{summary}</p>

        {standards_html}
    </div>

    <div class="issues">
        <h2>Compliance Issues</h2>
""",
    ]

    # Add issues
    issues = report_data.get('issues') or []
    if not issues:
        parts.append("<p>No compliance issues found.</p>")
    else:
        for issue in issues:
            # The level is used both as a CSS class and as badge text
            level = _esc(str(issue.get('level') or 'info').lower())
            section = _esc(issue.get('section', 'Unknown Section'))
            description = _esc(issue.get('description', 'No description'))
            recommendation = _esc(issue.get('recommendation', 'No recommendation'))
            parts.append(f"""
        <div class="issue {level}">
            <h3>{section}</h3>
            <p><span class="badge {level}">{level.upper()}</span> {description}</p>
            <p><strong>Recommendation:</strong> {recommendation}</p>
        </div>
""")

    # Close HTML
    parts.append("""
    </div>
</body>
</html>
""")

    return "".join(parts)
|
||||
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Token counting utilities for document processing.
|
||||
"""
|
||||
import tiktoken
|
||||
from typing import Dict, List, Optional, Union
|
||||
from loguru import logger
|
||||
|
||||
# Default model to use for token counting
|
||||
DEFAULT_MODEL = "gpt-4o"
|
||||
|
||||
def count_tokens(text: str, model: str = DEFAULT_MODEL) -> int:
    """
    Count the number of tokens in a text string using tiktoken.

    The text is always tokenized as ordinary text: strings that happen to
    look like special tokens (e.g. ``<|endoftext|>``) are counted like any
    other input instead of making the encoder raise.

    Args:
        text: The text to count tokens for
        model: The model to use for token counting (default: gpt-4o)

    Returns:
        Number of tokens in the text. If no encoding is available for the
        model, the ``cl100k_base`` encoding is used; if that fails as well,
        the count is estimated as one token per four characters.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
        # disallowed_special=() -> treat special-token look-alikes as plain text
        return len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        logger.warning(f"Error counting tokens with model {model}: {str(e)}")
        # Fallback to cl100k_base encoding if model-specific encoding fails
        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(text, disallowed_special=()))
        except Exception as e:
            logger.error(f"Error counting tokens with fallback encoding: {str(e)}")
            # If all else fails, use a rough approximation (4 chars per token)
            return len(text) // 4
|
||||
|
||||
def truncate_by_tokens(text: str, max_tokens: int, model: str = DEFAULT_MODEL) -> str:
    """
    Truncate text to fit within a maximum token count.

    Args:
        text: The text to truncate
        max_tokens: Maximum number of tokens to keep from ``text``; negative
            values are treated as 0
        model: The model to use for token counting (default: gpt-4o)

    Returns:
        ``text`` unchanged if it has at most ``max_tokens`` tokens. Otherwise
        the first ``max_tokens`` tokens of ``text`` followed by the marker
        ``"...(truncated)"``. The marker is appended after truncation, so
        the returned string can exceed ``max_tokens`` by the marker's own
        tokens. If no tokenizer is available, truncation falls back to four
        characters per token.
    """
    marker = "...(truncated)"

    # A negative budget would slice from the end (tokens[:-n]) and keep
    # almost the whole text, so clamp it to zero.
    max_tokens = max(max_tokens, 0)

    try:
        try:
            encoding = tiktoken.encoding_for_model(model)
        except Exception as e:
            # Same fallback as count_tokens: unknown models use cl100k_base
            logger.warning(f"No encoding for model {model}, using cl100k_base: {str(e)}")
            encoding = tiktoken.get_encoding("cl100k_base")

        # disallowed_special=() -> treat special-token look-alikes as plain text
        tokens = encoding.encode(text, disallowed_special=())

        if len(tokens) <= max_tokens:
            return text

        # Truncate tokens and decode
        truncated_text = encoding.decode(tokens[:max_tokens])

        # Cutting inside a multi-byte character makes decode() emit U+FFFD
        # replacement characters at the end; drop them.
        truncated_text = truncated_text.rstrip("\ufffd")

        # Add truncation indicator
        return truncated_text + marker
    except Exception as e:
        logger.warning(f"Error truncating by tokens with model {model}: {str(e)}")
        # Fallback to character-based truncation if token-based fails
        approx_chars = max_tokens * 4  # Rough approximation
        if len(text) <= approx_chars:
            return text
        return text[:approx_chars] + marker
|
||||
|
||||
def estimate_tokens_from_chars(char_count: int) -> int:
    """
    Roughly estimate a token count from a character count.

    Assumes an average of four characters per token and rounds down.

    Args:
        char_count: Number of characters

    Returns:
        Estimated number of tokens
    """
    chars_per_token = 4
    whole_tokens, _remainder = divmod(char_count, chars_per_token)
    return whole_tokens
|
||||
Reference in New Issue
Block a user