Initial commit

2025-07-17 22:20:25 +01:00
commit 0e3e22e8cb
39 changed files with 13295 additions and 0 deletions
@@ -0,0 +1,461 @@
+# Document processing
+import os
+import uuid
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, BinaryIO, Tuple
+import re
+from loguru import logger
+
+from app.core.models import (
+    Document,
+    DocumentMetadata,
+    DocumentStatus,
+    ComplianceReport,
+    ComplianceIssue,
+    ComplianceLevel,
+    DocumentEmbedding
+)
+from app.services.embedding import EmbeddingService
+from app.services.reasoning import ReasoningService
+from app.services.standards import StandardsService
+from app.utils.token_counter import count_tokens, truncate_by_tokens
+
+class DocumentService:
+    """Service for handling document processing and storage."""
+
+    def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
+        """
+        Wire up the collaborating services and create the in-memory stores.
+
+        Args:
+            embedding_service: Service stored for embedding work
+            reasoning_service: Service stored for compliance analysis
+            standards_service: Optional standards service; a new
+                StandardsService is created when this is omitted or falsy
+        """
+        # In-memory storage (replace with DB in production)
+        self.reports = {}
+        self.documents = {}
+
+        self.standards_service = standards_service if standards_service else StandardsService()
+        self.reasoning_service = reasoning_service
+        self.embedding_service = embedding_service
+
+    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
+        """
+        Register an uploaded file as a new document and process it.
+
+        The document is stored before processing starts, so a document whose
+        processing fails stays in storage with status FAILED.
+
+        Args:
+            file: The document file
+            filename: Name of the uploaded file; its extension decides whether
+                the upload is accepted
+
+        Returns:
+            Document object with metadata
+
+        Raises:
+            ValueError: If the file extension is not supported
+            Exception: Any error raised during processing is logged and re-raised
+        """
+        # Validate file type
+        if not self._validate_file_type(filename):
+            raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")
+
+        # Get file content
+        content = await self._read_file_content(file)
+
+        # Take a single timestamp so a fresh upload reports identical
+        # upload and last-modified times
+        now = datetime.now()
+
+        # NOTE: file_size is the length of the decoded text, not the raw byte count
+        metadata = DocumentMetadata(
+            filename=filename,
+            file_type=self._get_file_type(filename),
+            file_size=len(content),
+            upload_timestamp=now,
+            last_modified=now
+        )
+
+        # Create document object
+        document_id = str(uuid.uuid4())
+        document = Document(
+            id=document_id,
+            metadata=metadata,
+            status=DocumentStatus.PENDING,
+            version=1
+        )
+
+        # Store document in memory
+        self.documents[document_id] = document
+
+        # Start processing
+        try:
+            await self._process_document(document_id, content)
+        except Exception as e:
+            logger.error(f"Error processing document {document_id}: {str(e)}")
+            document.status = DocumentStatus.FAILED
+            raise
+
+        return document
+
+    async def get_document(self, document_id: str) -> Optional[Document]:
+        """
+        Look up a stored document.
+
+        Args:
+            document_id: The ID of the document to look up
+
+        Returns:
+            The stored Document, or None when the ID is unknown
+        """
+        stored = self.documents
+        return stored.get(document_id)
+
+    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
+        """
+        Look up a stored compliance report.
+
+        Args:
+            report_id: The ID of the report to look up
+
+        Returns:
+            The stored ComplianceReport, or None when the ID is unknown
+        """
+        stored = self.reports
+        return stored.get(report_id)
+
+    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
+        """
+        Replace the content of an existing document and process it again.
+
+        Errors raised while processing are logged and recorded as a FAILED
+        status on the returned document; they are not re-raised.
+
+        Args:
+            document_id: The ID of the document to resubmit
+            file: The updated document file
+
+        Returns:
+            The same Document object, updated in place
+
+        Raises:
+            ValueError: If no document with the given ID is stored
+        """
+        existing = await self.get_document(document_id)
+        if not existing:
+            raise ValueError(f"Document with ID {document_id} not found")
+
+        text = await self._read_file_content(file)
+
+        # Record the new revision before processing starts
+        meta = existing.metadata
+        meta.file_size = len(text)
+        meta.last_modified = datetime.now()
+        existing.version += 1
+        existing.status = DocumentStatus.PENDING
+
+        try:
+            await self._process_document(document_id, text)
+        except Exception as exc:
+            logger.error(f"Error processing resubmitted document {document_id}: {str(exc)}")
+            existing.status = DocumentStatus.FAILED
+
+        return existing
+
+    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
+        """
+        Run the analysis pipeline for a stored document.
+
+        The content is split into sections, the sections are embedded, the
+        applicable standards are looked up, and the reasoning service builds
+        the report, which is then stored under its report ID.
+
+        Args:
+            document_id: The ID of the document
+            content: Document content
+
+        Returns:
+            ComplianceReport object
+
+        Raises:
+            ValueError: If the document ID is not in storage
+            Exception: Any other failure is logged and re-raised unchanged
+        """
+        try:
+            doc = self.documents.get(document_id)
+            if not doc:
+                raise ValueError(f"Document {document_id} not found")
+
+            section_map = self._split_into_sections(content)
+
+            # Embeddings are attached to the document itself
+            doc.embedding = await self.embedding_service.embed_document(document_id, section_map)
+
+            standards = self.standards_service
+            if not standards:
+                # Fall back to a fixed list when no standards service is usable
+                logger.warning(f"No StandardsService available for document {document_id}")
+                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]
+            else:
+                # Log the standards service instance ID to verify singleton pattern
+                logger.info(f"Using StandardsService instance: {id(standards)}")
+                logger.info(f"Standards count before matching: {len(standards.standards)}")
+
+                standard_names = await standards.get_standard_names_for_document(content)
+                logger.info(f"Identified standards for document {document_id}: {standard_names}")
+
+            result = await self.reasoning_service.analyze_document(document_id, section_map, standard_names)
+            self.reports[result.report_id] = result
+            return result
+
+        except Exception as err:
+            logger.error(f"Error in document processing: {str(err)}")
+            raise
+
+    async def _read_file_content(self, file: BinaryIO) -> str:
+        """
+        Read the whole file and decode it to text.
+
+        UTF-8 is tried first, and a leading byte-order mark is dropped. If
+        that fails, the bytes are decoded as Latin-1 unless they contain NUL
+        bytes, which is taken as a sign of binary content.
+
+        Args:
+            file: The file to read
+
+        Returns:
+            File content as string
+
+        Raises:
+            ValueError: If the content is not valid UTF-8 and looks binary
+        """
+        raw = file.read()
+
+        try:
+            # 'utf-8-sig' behaves like UTF-8 but strips a leading BOM, which
+            # would otherwise stay in the text as '\ufeff' and stop a heading
+            # on the first line from being recognised
+            return raw.decode('utf-8-sig')
+        except UnicodeDecodeError:
+            pass
+
+        # Latin-1 maps every byte to a character, so decoding never fails and
+        # cannot detect binary data by itself. NUL bytes are used as the
+        # signal instead.
+        if b'\x00' in raw:
+            raise ValueError("Unable to decode file content. Please ensure file is text-based.")
+
+        return raw.decode('latin-1')
+
+    def _get_file_type(self, filename: str) -> str:
+        """
+        Derive the file type from the filename's extension.
+
+        Args:
+            filename: The name of the file
+
+        Returns:
+            Lower-cased extension without the leading dot; empty string when
+            the name has no extension
+        """
+        suffix = os.path.splitext(filename)[1]
+        bare = suffix.lstrip('.')
+        return bare.lower()
+
+    def _validate_file_type(self, filename: str) -> bool:
+        """
+        Check the filename's extension against the supported list.
+
+        The comparison is case-insensitive.
+
+        Args:
+            filename: Name of the file to validate
+
+        Returns:
+            bool: True if file type is supported, False otherwise
+        """
+        suffix = os.path.splitext(filename)[1].lower()
+        return suffix in ('.txt', '.md', '.rst', '.doc', '.docx', '.pdf')
+
+    def _split_into_sections(self, content: str) -> Dict[str, str]:
+        """
+        Split document content into sections.
+
+        The result always contains the whole text under "full_document".
+        When markdown headings of level 1-3 are present, each heading adds an
+        entry keyed "h<level>_<title>" holding the text up to the next
+        heading; a repeated key gets a numeric suffix ("_2", "_3", ...) so
+        earlier sections are not overwritten. Without headings, every
+        blank-line separated paragraph longer than 100 characters is added as
+        "paragraph_<n>", where n counts all non-empty paragraphs.
+
+        Args:
+            content: The document content
+
+        Returns:
+            Dictionary mapping section names to section content
+        """
+        # This is a simple implementation - in production, you would use more advanced
+        # techniques like heading detection, markdown parsing, etc.
+        sections = {"full_document": content}
+
+        # Only spaces/tabs may separate the '#' marks from the title. A plain
+        # \s would also match a newline and let a bare '#' line swallow the
+        # following line as its title.
+        heading_pattern = re.compile(r'^(#{1,3})[ \t]+(.+)$', re.MULTILINE)
+        matches = list(heading_pattern.finditer(content))
+
+        if not matches:
+            # No headings found, fall back to blank-line separated paragraphs
+            paragraphs = [p for p in content.split('\n\n') if p.strip()]
+            for number, paragraph in enumerate(paragraphs, start=1):
+                if len(paragraph) > 100:  # Only include substantial paragraphs
+                    sections[f"paragraph_{number}"] = paragraph
+            return sections
+
+        # Each section runs from the end of its heading to the start of the
+        # next heading, or to the end of the document for the last one
+        section_ends = [m.start() for m in matches[1:]] + [len(content)]
+        for match, end_pos in zip(matches, section_ends):
+            heading_level = len(match.group(1))
+            section_name = match.group(2).strip()
+            base_key = f"h{heading_level}_{section_name}"
+
+            # Keep repeated headings apart instead of overwriting
+            section_key = base_key
+            suffix = 2
+            while section_key in sections:
+                section_key = f"{base_key}_{suffix}"
+                suffix += 1
+
+            sections[section_key] = content[match.end():end_pos].strip()
+
+        return sections
+
+    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
+        """
+        Build a placeholder compliance report from simple text checks.
+
+        Only the "full_document" section is inspected. Each issue found
+        lowers the score by 0.1, with a floor of 0.0.
+
+        Args:
+            document_id: The ID of the document
+            sections: Dictionary of document sections
+
+        Returns:
+            ComplianceReport object
+        """
+        # In production, this would use the reasoning service
+        found = []
+
+        if "full_document" in sections:
+            text = sections["full_document"]
+            lowered = text.lower()
+
+            # Mock structure check: the word "introduction" must appear somewhere
+            if "introduction" not in lowered:
+                found.append(ComplianceIssue(
+                    section="Document Structure",
+                    description="Missing introduction section",
+                    level=ComplianceLevel.MAJOR,
+                    recommendation="Add an introduction section to provide context for the document"
+                ))
+
+            # Mock formatting check: fewer than three '#' characters overall
+            if text.count('#') < 3:
+                found.append(ComplianceIssue(
+                    section="Formatting",
+                    description="Insufficient section headings",
+                    level=ComplianceLevel.MINOR,
+                    recommendation="Use markdown headings to better structure the document"
+                ))
+
+            # Mock technical check: "compliance" mentioned without "standard"
+            if "compliance" in lowered and "standard" not in lowered:
+                found.append(ComplianceIssue(
+                    section="Technical Content",
+                    description="Mentions compliance but doesn't reference specific standards",
+                    level=ComplianceLevel.CRITICAL,
+                    recommendation="Specify which standards or regulations the document complies with"
+                ))
+
+        # Mock score: 1.0 when clean, otherwise 0.1 off per issue
+        score = max(0.0, 1.0 - (len(found) * 0.1)) if found else 1.0
+
+        if found:
+            def count_at(level):
+                return len([issue for issue in found if issue.level == level])
+
+            breakdown = (
+                (count_at(ComplianceLevel.CRITICAL), "critical"),
+                (count_at(ComplianceLevel.MAJOR), "major"),
+                (count_at(ComplianceLevel.MINOR), "minor"),
+            )
+            # Only severities that actually occur are listed
+            parts = [f"{amount} {label}" for amount, label in breakdown if amount]
+
+            summary_text = f"The document has {len(found)} compliance issues: "
+            summary_text += ", ".join(parts) + "."
+            summary_text += " See detailed report for recommendations."
+        else:
+            summary_text = "The document meets all compliance requirements. No issues found."
+
+        return ComplianceReport(
+            document_id=document_id,
+            compliance_score=score,
+            summary=summary_text,
+            issues=found
+        )
+
+    async def _process_document(self, document_id: str, content: str) -> None:
+        """
+        Drive a stored document through processing and track its status.
+
+        The status moves to PROCESSING, then to COMPLETED once the report ID
+        has been appended to the document, or to FAILED when anything raises.
+
+        Args:
+            document_id: The ID of the document to process
+            content: The document content
+
+        Raises:
+            ValueError: If the document ID is not in storage
+            Exception: Any processing error, re-raised after marking FAILED
+        """
+        target = self.documents.get(document_id)
+        if not target:
+            # Nothing to mark as failed when the document does not exist
+            raise ValueError(f"Document {document_id} not found")
+
+        try:
+            target.status = DocumentStatus.PROCESSING
+
+            generated = await self.process_document(document_id, content)
+
+            # Remember which report belongs to this document
+            target.reports.append(generated.report_id)
+            target.status = DocumentStatus.COMPLETED
+        except Exception:
+            target.status = DocumentStatus.FAILED
+            raise
+
+    async def get_document_stats(self, document_id: str) -> Dict[str, object]:
+        """
+        Get statistics for a document.
+
+        Score and issue counts come from the last report ID listed on the
+        document. When the document has no reports, or that report is no
+        longer stored, the score is None and the counts are 0.
+
+        Args:
+            document_id: The ID of the document
+
+        Returns:
+            Dictionary containing document statistics
+
+        Raises:
+            ValueError: If no document with the given ID is stored
+        """
+        document = await self.get_document(document_id)
+        if not document:
+            raise ValueError(f"Document {document_id} not found")
+
+        # The newest report is the last ID appended to the document
+        latest_report = None
+        if document.reports:
+            latest_report = await self.get_report(document.reports[-1])
+
+        stats = {
+            "document_id": document_id,
+            "version": document.version,
+            "status": document.status,
+            "file_size": document.metadata.file_size,
+            "upload_date": document.metadata.upload_timestamp,
+            "last_modified": document.metadata.last_modified,
+            "num_reports": len(document.reports),
+            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
+            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
+            "major_issues": latest_report.major_issues_count if latest_report else 0,
+            "minor_issues": latest_report.minor_issues_count if latest_report else 0
+        }
+
+        return stats
+
+    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
+        """
+        Delete documents uploaded more than `days` days ago, with their reports.
+
+        Args:
+            days: Number of days after which documents should be removed
+
+        Returns:
+            List of removed document IDs
+        """
+        threshold = datetime.now() - timedelta(days=days)
+        purged = []
+
+        # Iterate over a snapshot because entries are deleted along the way
+        for key, doc in tuple(self.documents.items()):
+            if not (doc.metadata.upload_timestamp < threshold):
+                continue
+
+            # Drop the document's reports first; unknown report IDs are ignored
+            for report_key in doc.reports:
+                self.reports.pop(report_key, None)
+
+            del self.documents[key]
+            purged.append(key)
+
+        return purged
+
+