Initial commit
This commit is contained in:
@@ -0,0 +1,461 @@
|
||||
# Document processing
|
||||
import os
import re
import uuid
from datetime import datetime, timedelta
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from loguru import logger

from app.core.models import (
    Document,
    DocumentMetadata,
    DocumentStatus,
    ComplianceReport,
    ComplianceIssue,
    ComplianceLevel,
    DocumentEmbedding
)
from app.services.embedding import EmbeddingService
from app.services.reasoning import ReasoningService
from app.services.standards import StandardsService
from app.utils.token_counter import count_tokens, truncate_by_tokens
|
||||
|
||||
class DocumentService:
|
||||
"""Service for handling document processing and storage."""
|
||||
|
||||
def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
    """
    Store the collaborating services and create the empty in-memory stores.

    Args:
        embedding_service: Service used to embed document sections
        reasoning_service: Service used to run the compliance analysis
        standards_service: Optional standards service; when it is omitted
            (or falsy) a fresh ``StandardsService()`` is created instead
    """
    self.embedding_service = embedding_service
    self.reasoning_service = reasoning_service

    # Fall back to a default standards service when none was supplied.
    if not standards_service:
        standards_service = StandardsService()
    self.standards_service = standards_service

    # In-memory storage only (replace with DB in production).
    self.documents = dict()  # document ID -> Document
    self.reports = dict()  # report ID -> ComplianceReport
|
||||
|
||||
async def upload_document(self, file: BinaryIO, filename: str) -> Document:
    """
    Register a newly uploaded file and run it through processing.

    Args:
        file: File object holding the uploaded document
        filename: Name of the uploaded file; its extension decides
            whether the upload is accepted

    Returns:
        The stored Document object, including its metadata

    Raises:
        ValueError: If the file extension is not supported
        Exception: Any processing error is logged, the document is marked
            FAILED, and the error is re-raised to the caller
    """
    # Reject unsupported extensions before reading anything from the file.
    is_supported = self._validate_file_type(filename)
    if not is_supported:
        raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

    content = await self._read_file_content(file)

    # NOTE: file_size is the length of the decoded text returned above.
    metadata = DocumentMetadata(
        filename=filename,
        file_type=self._get_file_type(filename),
        file_size=len(content),
        upload_timestamp=datetime.now(),
        last_modified=datetime.now()
    )

    document_id = str(uuid.uuid4())
    document = Document(
        id=document_id,
        metadata=metadata,
        status=DocumentStatus.PENDING,
        version=1
    )

    # The document is stored before processing starts, so it stays
    # retrievable (with a FAILED status) even when processing raises.
    self.documents[document_id] = document

    try:
        await self._process_document(document_id, content)
    except Exception as exc:
        logger.error(f"Error processing document {document_id}: {str(exc)}")
        document.status = DocumentStatus.FAILED
        raise

    return document
|
||||
|
||||
async def get_document(self, document_id: str) -> Optional[Document]:
    """
    Look up a stored document.

    Args:
        document_id: ID the document was stored under

    Returns:
        The matching Document, or None when no document has that ID
    """
    found = self.documents.get(document_id, None)
    return found
|
||||
|
||||
async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
    """
    Look up a stored compliance report.

    Args:
        report_id: ID the report was stored under

    Returns:
        The matching ComplianceReport, or None when no report has that ID
    """
    found = self.reports.get(report_id, None)
    return found
|
||||
|
||||
async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
    """
    Replace the content of an existing document and process it again.

    Args:
        document_id: ID of the document being resubmitted
        file: File object holding the updated document

    Returns:
        The updated Document object. Processing errors are logged rather
        than raised; in that case the returned document has status FAILED.

    Raises:
        ValueError: If no document with the given ID exists
    """
    document = await self.get_document(document_id)
    if not document:
        raise ValueError(f"Document with ID {document_id} not found")

    content = await self._read_file_content(file)

    # Record the new revision before processing starts.
    metadata = document.metadata
    metadata.file_size = len(content)
    metadata.last_modified = datetime.now()
    document.version += 1
    document.status = DocumentStatus.PENDING

    try:
        await self._process_document(document_id, content)
    except Exception as exc:
        # Failures are logged and reflected in the status, not re-raised.
        logger.error(f"Error processing resubmitted document {document_id}: {str(exc)}")
        document.status = DocumentStatus.FAILED

    return document
|
||||
|
||||
async def process_document(self, document_id: str, content: str) -> ComplianceReport:
    """
    Analyse a stored document and produce its compliance report.

    The content is split into sections, the sections are embedded and
    attached to the document, the applicable standards are looked up, and
    the reasoning service produces the report, which is stored by its ID.

    Args:
        document_id: ID of a document already present in the store
        content: Text content of the document

    Returns:
        The ComplianceReport produced by the reasoning service

    Raises:
        ValueError: If the document ID is unknown
        Exception: Any other failure is logged and re-raised unchanged
    """
    try:
        document = self.documents.get(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        sections = self._split_into_sections(content)
        document.embedding = await self.embedding_service.embed_document(document_id, sections)

        standards = self.standards_service
        if not standards:
            # Without a standards service, fall back to a fixed list.
            logger.warning(f"No StandardsService available for document {document_id}")
            standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]
        else:
            # The instance id is logged to verify the singleton pattern.
            logger.info(f"Using StandardsService instance: {id(standards)}")
            logger.info(f"Standards count before matching: {len(standards.standards)}")

            standard_names = await standards.get_standard_names_for_document(content)
            logger.info(f"Identified standards for document {document_id}: {standard_names}")

        report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)

        # Keep the report retrievable by its own ID.
        self.reports[report.report_id] = report
        return report

    except Exception as exc:
        logger.error(f"Error in document processing: {str(exc)}")
        raise
|
||||
|
||||
async def _read_file_content(self, file: BinaryIO) -> str:
|
||||
"""
|
||||
Read and decode file content.
|
||||
|
||||
Args:
|
||||
file: The file to read
|
||||
|
||||
Returns:
|
||||
File content as string
|
||||
"""
|
||||
file_content = file.read()
|
||||
|
||||
# Try to decode as UTF-8
|
||||
try:
|
||||
return file_content.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
# Try other encodings if UTF-8 fails
|
||||
try:
|
||||
return file_content.decode('latin-1')
|
||||
except:
|
||||
raise ValueError("Unable to decode file content. Please ensure file is text-based.")
|
||||
|
||||
def _get_file_type(self, filename: str) -> str:
|
||||
"""
|
||||
Determine file type from filename.
|
||||
|
||||
Args:
|
||||
filename: The name of the file
|
||||
|
||||
Returns:
|
||||
File type (extension)
|
||||
"""
|
||||
_, extension = os.path.splitext(filename)
|
||||
return extension.lstrip('.').lower()
|
||||
|
||||
def _validate_file_type(self, filename: str) -> bool:
|
||||
"""
|
||||
Validate if the file type is supported.
|
||||
|
||||
Args:
|
||||
filename: Name of the file to validate
|
||||
|
||||
Returns:
|
||||
bool: True if file type is supported, False otherwise
|
||||
"""
|
||||
SUPPORTED_EXTENSIONS = {'.txt', '.md', '.rst', '.doc', '.docx', '.pdf'}
|
||||
_, ext = os.path.splitext(filename)
|
||||
return ext.lower() in SUPPORTED_EXTENSIONS
|
||||
|
||||
def _split_into_sections(self, content: str) -> Dict[str, str]:
|
||||
"""
|
||||
Split document content into sections.
|
||||
|
||||
Args:
|
||||
content: The document content
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to section content
|
||||
"""
|
||||
# This is a simple implementation - in production, you would use more advanced
|
||||
# techniques like heading detection, markdown parsing, etc.
|
||||
|
||||
# For simplicity, we'll just split by markdown headings
|
||||
sections = {}
|
||||
|
||||
# Add the whole document as one section
|
||||
sections["full_document"] = content
|
||||
|
||||
# Try to split by markdown headings
|
||||
heading_pattern = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)
|
||||
matches = list(heading_pattern.finditer(content))
|
||||
|
||||
if matches:
|
||||
for i, match in enumerate(matches):
|
||||
heading_level = len(match.group(1))
|
||||
section_name = match.group(2).strip()
|
||||
|
||||
# Get section content (from this heading to the next, or to the end)
|
||||
start_pos = match.end()
|
||||
end_pos = matches[i+1].start() if i < len(matches) - 1 else len(content)
|
||||
|
||||
section_content = content[start_pos:end_pos].strip()
|
||||
section_key = f"h{heading_level}_{section_name}"
|
||||
|
||||
sections[section_key] = section_content
|
||||
else:
|
||||
# No headings found, try to split by newlines into paragraphs
|
||||
paragraphs = [p for p in content.split('\n\n') if p.strip()]
|
||||
|
||||
for i, paragraph in enumerate(paragraphs):
|
||||
if len(paragraph) > 100: # Only include substantial paragraphs
|
||||
sections[f"paragraph_{i+1}"] = paragraph
|
||||
|
||||
return sections
|
||||
|
||||
async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
    """
    Build a mock compliance report for development/testing.

    Three keyword-based mock checks run against the ``"full_document"``
    section, if present. Every issue found lowers the score by 0.1 (never
    below 0.0), and the summary lists the issue counts per severity.

    Args:
        document_id: The ID of the document
        sections: Dictionary of document sections

    Returns:
        ComplianceReport object
    """
    # In production, this would use the reasoning service.
    issues = []

    if "full_document" in sections:
        content = sections["full_document"]
        lowered = content.lower()

        # (triggered, section, description, level, recommendation) per mock check
        mock_checks = (
            (
                "introduction" not in lowered,
                "Document Structure",
                "Missing introduction section",
                ComplianceLevel.MAJOR,
                "Add an introduction section to provide context for the document",
            ),
            (
                content.count('#') < 3,
                "Formatting",
                "Insufficient section headings",
                ComplianceLevel.MINOR,
                "Use markdown headings to better structure the document",
            ),
            (
                "compliance" in lowered and "standard" not in lowered,
                "Technical Content",
                "Mentions compliance but doesn't reference specific standards",
                ComplianceLevel.CRITICAL,
                "Specify which standards or regulations the document complies with",
            ),
        )
        for triggered, section, description, level, recommendation in mock_checks:
            if triggered:
                issues.append(ComplianceIssue(
                    section=section,
                    description=description,
                    level=level,
                    recommendation=recommendation
                ))

    # Mock score: start from 1.0 and subtract 0.1 per issue.
    compliance_score = max(0.0, 1.0 - (len(issues) * 0.1)) if issues else 1.0

    if issues:
        severity_labels = (
            (ComplianceLevel.CRITICAL, "critical"),
            (ComplianceLevel.MAJOR, "major"),
            (ComplianceLevel.MINOR, "minor"),
        )
        fragments = []
        for level, label in severity_labels:
            count = len([issue for issue in issues if issue.level == level])
            if count:
                fragments.append(f"{count} {label}")

        headline = f"The document has {len(issues)} compliance issues: "
        summary = (headline + ", ".join(fragments)).rstrip(", ") + "."
        summary += " See detailed report for recommendations."
    else:
        summary = "The document meets all compliance requirements. No issues found."

    return ComplianceReport(
        document_id=document_id,
        compliance_score=compliance_score,
        summary=summary,
        issues=issues
    )
|
||||
|
||||
async def _process_document(self, document_id: str, content: str) -> None:
|
||||
"""
|
||||
Internal method to process a document and update its status.
|
||||
|
||||
Args:
|
||||
document_id: The ID of the document to process
|
||||
content: The document content
|
||||
"""
|
||||
try:
|
||||
# Get the document
|
||||
document = self.documents.get(document_id)
|
||||
if not document:
|
||||
raise ValueError(f"Document {document_id} not found")
|
||||
|
||||
# Update status to processing
|
||||
document.status = DocumentStatus.PROCESSING
|
||||
|
||||
# Generate compliance report
|
||||
report = await self.process_document(document_id, content)
|
||||
|
||||
# Store report ID in document
|
||||
document.reports.append(report.report_id)
|
||||
|
||||
# Update document status
|
||||
document.status = DocumentStatus.COMPLETED
|
||||
|
||||
except Exception as e:
|
||||
# Update document status to failed
|
||||
if document:
|
||||
document.status = DocumentStatus.FAILED
|
||||
raise
|
||||
|
||||
async def get_document_stats(self, document_id: str) -> Dict[str, Any]:
    """
    Get statistics for a document.

    The compliance figures come from the most recent report, i.e. the last
    ID in the document's report list. When the document has no reports, or
    that report cannot be found, the score is None and the issue counts
    are 0.

    Args:
        document_id: The ID of the document

    Returns:
        Dictionary containing document statistics

    Raises:
        ValueError: If no document with the given ID exists
    """
    document = await self.get_document(document_id)
    if not document:
        raise ValueError(f"Document {document_id} not found")

    latest_report = None
    if document.reports:
        latest_report = await self.get_report(document.reports[-1])

    metadata = document.metadata
    stats = {
        "document_id": document_id,
        "version": document.version,
        "status": document.status,
        "file_size": metadata.file_size,
        "upload_date": metadata.upload_timestamp,
        "last_modified": metadata.last_modified,
        "num_reports": len(document.reports),
        "latest_compliance_score": latest_report.compliance_score if latest_report else None,
        "critical_issues": latest_report.critical_issues_count if latest_report else 0,
        "major_issues": latest_report.major_issues_count if latest_report else 0,
        "minor_issues": latest_report.minor_issues_count if latest_report else 0,
    }

    return stats
|
||||
|
||||
async def cleanup_old_documents(self, days: int = 30) -> List[str]:
    """
    Purge documents whose upload timestamp is older than the given age.

    The reports listed on each purged document are removed from the report
    store as well.

    Args:
        days: Number of days after which documents should be removed

    Returns:
        List of removed document IDs
    """
    threshold = datetime.now() - timedelta(days=days)
    purged = []

    # Walk a snapshot of the IDs so the store can shrink while iterating.
    for doc_id in tuple(self.documents):
        document = self.documents[doc_id]
        if not (document.metadata.upload_timestamp < threshold):
            continue

        # Drop the document's reports first, then the document itself.
        for report_id in document.reports:
            self.reports.pop(report_id, None)
        del self.documents[doc_id]
        purged.append(doc_id)

    return purged
|
||||
|
||||
|
||||
Reference in New Issue
Block a user