# Document processing
import os
import re
import uuid
from datetime import datetime, timedelta
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from loguru import logger

from app.core.models import (
    ComplianceIssue,
    ComplianceLevel,
    ComplianceReport,
    Document,
    DocumentEmbedding,
    DocumentMetadata,
    DocumentStatus,
)
from app.services.embedding import EmbeddingService
from app.services.reasoning import ReasoningService
from app.services.standards import StandardsService
from app.utils.token_counter import count_tokens, truncate_by_tokens


class DocumentService:
    """
    Service for handling document processing and storage.

    Documents and compliance reports are held in two in-memory dictionaries
    keyed by their IDs (replace with a database in production).
    """

    # Extensions accepted by _validate_file_type (compared lower-cased).
    _SUPPORTED_EXTENSIONS = frozenset({'.txt', '.md', '.rst', '.doc', '.docx', '.pdf'})

    # Markdown headings of level 1-3 at the start of a line.
    _HEADING_PATTERN = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)

    def __init__(self,
                 embedding_service: EmbeddingService,
                 reasoning_service: ReasoningService,
                 standards_service: Optional[StandardsService] = None):
        """
        Initialize with required services.

        Args:
            embedding_service: Service used to embed document sections
            reasoning_service: Service used to produce compliance reports
            standards_service: Service used to pick relevant standards; a new
                StandardsService is created when None is passed
        """
        self.embedding_service = embedding_service
        self.reasoning_service = reasoning_service
        # Compare against None explicitly so that a falsy-but-valid service
        # instance is not silently replaced by a fresh one.
        self.standards_service = (
            standards_service if standards_service is not None else StandardsService()
        )
        self.documents = {}  # In-memory storage for documents (replace with DB in production)
        self.reports = {}  # In-memory storage for reports (replace with DB in production)

    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
        """
        Process an uploaded document.

        Args:
            file: The document file
            filename: Name of the uploaded file

        Returns:
            Document object with metadata

        Raises:
            ValueError: If the file extension is not supported
            Exception: Any error raised while processing is logged and
                re-raised; the stored document is left with status FAILED
        """
        # Validate file type
        if not self._validate_file_type(filename):
            raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

        # Get file content
        # NOTE(review): .doc/.docx/.pdf pass validation but are decoded as
        # plain text below - confirm whether real parsing is intended.
        content = await self._read_file_content(file)

        # Create document metadata. file_size is the length of the decoded
        # text (characters), not the number of raw bytes.
        # One timestamp for both fields so a fresh upload has
        # upload_timestamp == last_modified.
        now = datetime.now()
        metadata = DocumentMetadata(
            filename=filename,
            file_type=self._get_file_type(filename),
            file_size=len(content),
            upload_timestamp=now,
            last_modified=now
        )

        # Create document object
        document_id = str(uuid.uuid4())
        document = Document(
            id=document_id,
            metadata=metadata,
            status=DocumentStatus.PENDING,
            version=1
        )

        # Store document in memory
        self.documents[document_id] = document

        # Start processing
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED
            raise

        return document

    async def get_document(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by ID.

        Args:
            document_id: The ID of the document to retrieve

        Returns:
            Document object if found, None otherwise
        """
        return self.documents.get(document_id)

    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
        """
        Retrieve a compliance report by ID.

        Args:
            report_id: The ID of the report to retrieve

        Returns:
            ComplianceReport object if found, None otherwise
        """
        return self.reports.get(report_id)

    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
        """
        Resubmit a document with changes.

        Args:
            document_id: The ID of the document to resubmit
            file: The updated document file

        Returns:
            Updated Document object; its status is FAILED when processing
            raised an error

        Raises:
            ValueError: If no document with the given ID exists
        """
        # Check if document exists
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document with ID {document_id} not found")

        # Get file content
        content = await self._read_file_content(file)

        # Update document metadata
        document.metadata.file_size = len(content)
        document.metadata.last_modified = datetime.now()
        document.version += 1
        document.status = DocumentStatus.PENDING

        # Process the updated document
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            # Not re-raised here (unlike upload_document): the caller gets
            # the document back with status FAILED.
            logger.error(f"Error processing resubmitted document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED

        return document

    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
        """
        Process document and generate compliance report.

        Splits the content into sections, stores the embedding returned by
        the embedding service on the document, selects standard names and
        asks the reasoning service for a report, which is stored by its
        report_id.

        Args:
            document_id: The ID of the document
            content: Document content

        Returns:
            ComplianceReport object

        Raises:
            ValueError: If the document ID is unknown
            Exception: Any other error is logged and re-raised
        """
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Split document into sections
            sections = self._split_into_sections(content)

            # Generate embeddings for sections
            document.embedding = await self.embedding_service.embed_document(document_id, sections)

            # Identify relevant standards for the document
            if self.standards_service is not None:
                # Log the standards service instance ID to verify singleton pattern
                logger.info(f"Using StandardsService instance: {id(self.standards_service)}")
                logger.info(f"Standards count before matching: {len(self.standards_service.standards)}")

                standard_names = await self.standards_service.get_standard_names_for_document(content)
                logger.info(f"Identified standards for document {document_id}: {standard_names}")
            else:
                # Only reachable if the attribute was reset to None after
                # construction; fall back to a fixed list of standards.
                logger.warning(f"No StandardsService available for document {document_id}")
                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]

            # Use reasoning service for compliance analysis
            report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)

            # Store the report
            self.reports[report.report_id] = report

            return report

        except Exception as e:
            logger.error(f"Error in document processing: {str(e)}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.

        Decoding is attempted as UTF-8 first and as latin-1 second.

        Args:
            file: The file to read

        Returns:
            File content as string

        Raises:
            ValueError: If the content cannot be decoded
        """
        raw = file.read()

        # A stream opened in text mode already yields str; nothing to decode.
        if isinstance(raw, str):
            return raw

        # Try to decode as UTF-8
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            pass

        # Try other encodings if UTF-8 fails. latin-1 maps every byte value,
        # so this fallback is not expected to fail for bytes input; the
        # handler is kept narrow instead of swallowing unrelated errors.
        try:
            return raw.decode('latin-1')
        except UnicodeDecodeError as exc:
            raise ValueError("Unable to decode file content. Please ensure file is text-based.") from exc

    def _get_file_type(self, filename: str) -> str:
        """
        Determine file type from filename.

        Args:
            filename: The name of the file

        Returns:
            File type (lower-cased extension without the leading dot)
        """
        _, extension = os.path.splitext(filename)
        return extension.lstrip('.').lower()

    def _validate_file_type(self, filename: str) -> bool:
        """
        Validate if the file type is supported.

        Args:
            filename: Name of the file to validate

        Returns:
            bool: True if file type is supported, False otherwise
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in self._SUPPORTED_EXTENSIONS

    def _split_into_sections(self, content: str) -> Dict[str, str]:
        """
        Split document content into sections.

        The result always contains the key "full_document" with the whole
        content. When markdown headings (levels 1-3) are present, each
        heading adds a key "h<level>_<title>" holding the text up to the
        next heading; a repeated key gets a numeric suffix ("_2", "_3", ...)
        so earlier sections are not overwritten. Without headings,
        blank-line separated paragraphs longer than 100 characters are added
        as "paragraph_<n>".

        Args:
            content: The document content

        Returns:
            Dictionary mapping section names to section content
        """
        # This is a simple implementation - in production, you would use more advanced
        # techniques like heading detection, markdown parsing, etc.
        # For simplicity, we'll just split by markdown headings

        # Add the whole document as one section
        sections = {"full_document": content}

        # Try to split by markdown headings
        matches = list(self._HEADING_PATTERN.finditer(content))

        if not matches:
            # No headings found, try to split by newlines into paragraphs
            paragraphs = [p for p in content.split('\n\n') if p.strip()]
            for i, paragraph in enumerate(paragraphs):
                if len(paragraph) > 100:  # Only include substantial paragraphs
                    sections[f"paragraph_{i+1}"] = paragraph
            return sections

        for i, match in enumerate(matches):
            heading_level = len(match.group(1))
            section_name = match.group(2).strip()

            # Get section content (from this heading to the next, or to the end)
            start_pos = match.end()
            end_pos = matches[i + 1].start() if i < len(matches) - 1 else len(content)

            section_key = f"h{heading_level}_{section_name}"
            if section_key in sections:
                # Same level and title seen before: pick the first free
                # suffix instead of overwriting the earlier section.
                suffix = 2
                while f"{section_key}_{suffix}" in sections:
                    suffix += 1
                section_key = f"{section_key}_{suffix}"

            sections[section_key] = content[start_pos:end_pos].strip()

        return sections

    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
        """
        Generate a mock compliance report for development/testing.

        Args:
            document_id: The ID of the document
            sections: Dictionary of document sections

        Returns:
            ComplianceReport object
        """
        # In production, this would use the reasoning service
        # For now, we'll generate a simple mock report

        # Create some mock issues
        issues = []

        if "full_document" in sections:
            content = sections["full_document"]
            lowered = content.lower()

            # Check for missing sections (mock check)
            if "introduction" not in lowered:
                issues.append(ComplianceIssue(
                    section="Document Structure",
                    description="Missing introduction section",
                    level=ComplianceLevel.MAJOR,
                    recommendation="Add an introduction section to provide context for the document"
                ))

            # Check for formatting issues (mock check)
            if content.count('#') < 3:
                issues.append(ComplianceIssue(
                    section="Formatting",
                    description="Insufficient section headings",
                    level=ComplianceLevel.MINOR,
                    recommendation="Use markdown headings to better structure the document"
                ))

            # Check for technical compliance (mock check)
            if "compliance" in lowered and "standard" not in lowered:
                issues.append(ComplianceIssue(
                    section="Technical Content",
                    description="Mentions compliance but doesn't reference specific standards",
                    level=ComplianceLevel.CRITICAL,
                    recommendation="Specify which standards or regulations the document complies with"
                ))

        # Calculate mock compliance score: 0.1 off per issue, floored at 0.0
        # (evaluates to 1.0 when there are no issues)
        compliance_score = max(0.0, 1.0 - (len(issues) * 0.1))

        # Create summary based on issues
        if not issues:
            summary = "The document meets all compliance requirements. No issues found."
        else:
            labelled_counts = (
                (sum(1 for i in issues if i.level == ComplianceLevel.CRITICAL), "critical"),
                (sum(1 for i in issues if i.level == ComplianceLevel.MAJOR), "major"),
                (sum(1 for i in issues if i.level == ComplianceLevel.MINOR), "minor"),
            )
            breakdown = ", ".join(f"{count} {label}" for count, label in labelled_counts if count)

            summary = f"The document has {len(issues)} compliance issues: " + breakdown
            # rstrip only has an effect when the breakdown is empty.
            summary = summary.rstrip(", ") + "."
            summary += " See detailed report for recommendations."

        # Create report
        report = ComplianceReport(
            document_id=document_id,
            compliance_score=compliance_score,
            summary=summary,
            issues=issues
        )

        return report

    async def _process_document(self, document_id: str, content: str) -> None:
        """
        Internal method to process a document and update its status.

        Sets the status to PROCESSING, generates a report via
        process_document, appends the report ID to the document and sets the
        status to COMPLETED. On any error the status is set to FAILED (when
        the document exists) and the error is re-raised.

        Args:
            document_id: The ID of the document to process
            content: The document content

        Raises:
            ValueError: If the document ID is unknown
        """
        # Bound before the try block so the except clause can always test it.
        document = None
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Update status to processing
            document.status = DocumentStatus.PROCESSING

            # Generate compliance report
            report = await self.process_document(document_id, content)

            # Store report ID in document
            document.reports.append(report.report_id)

            # Update document status
            document.status = DocumentStatus.COMPLETED

        except Exception:
            # Update document status to failed
            if document:
                document.status = DocumentStatus.FAILED
            raise

    async def get_document_stats(self, document_id: str) -> Dict[str, Any]:
        """
        Get statistics for a document.

        Issue counts and the compliance score are taken from the report
        whose ID is last in the document's report list, when that report is
        still stored.

        Args:
            document_id: The ID of the document

        Returns:
            Dictionary containing document statistics

        Raises:
            ValueError: If the document ID is unknown
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        latest_report = None
        if document.reports:
            latest_report = await self.get_report(document.reports[-1])

        stats = {
            "document_id": document_id,
            "version": document.version,
            "status": document.status,
            "file_size": document.metadata.file_size,
            "upload_date": document.metadata.upload_timestamp,
            "last_modified": document.metadata.last_modified,
            "num_reports": len(document.reports),
            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
            "major_issues": latest_report.major_issues_count if latest_report else 0,
            "minor_issues": latest_report.minor_issues_count if latest_report else 0
        }

        return stats

    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
        """
        Remove documents older than specified days.

        A document is removed, together with the reports listed on it, when
        its upload timestamp is earlier than now minus the given number of
        days.

        Args:
            days: Number of days after which documents should be removed

        Returns:
            List of removed document IDs
        """
        cutoff_date = datetime.now() - timedelta(days=days)
        removed_ids = []

        # Iterate over a snapshot because entries are removed in the loop.
        for doc_id, document in list(self.documents.items()):
            if document.metadata.upload_timestamp < cutoff_date:
                # Remove associated reports
                for report_id in document.reports:
                    self.reports.pop(report_id, None)

                # Remove document
                self.documents.pop(doc_id)
                removed_ids.append(doc_id)

        return removed_ids