# ds_scp_task_solution/app/services/document.py

# Document processing
import os
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Optional, BinaryIO, Tuple
import re
from loguru import logger

from app.core.models import (
    Document,
    DocumentMetadata,
    DocumentStatus,
    ComplianceReport,
    ComplianceIssue,
    ComplianceLevel,
    DocumentEmbedding
)
from app.services.embedding import EmbeddingService
from app.services.reasoning import ReasoningService
from app.services.standards import StandardsService
from app.utils.token_counter import count_tokens, truncate_by_tokens

class DocumentService:
    """Service for handling document processing and storage."""

    def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
        """Initialize with required services."""
        self.embedding_service = embedding_service
        self.reasoning_service = reasoning_service
        self.standards_service = standards_service or StandardsService()
        self.documents = {}  # In-memory storage for documents (replace with DB in production)
        self.reports = {}  # In-memory storage for reports (replace with DB in production)

    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
        """
        Process an uploaded document.

        Args:
            file: The document file
            filename: Name of the uploaded file

        Returns:
            Document object with metadata
        """
        # Validate file type
        if not self._validate_file_type(filename):
            raise ValueError(f"Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

        # Get file content
        content = await self._read_file_content(file)

        # Extract file metadata
        file_size = len(content)
        file_type = self._get_file_type(filename)

        # Create document metadata
        metadata = DocumentMetadata(
            filename=filename,
            file_type=file_type,
            file_size=file_size,
            upload_timestamp=datetime.now(),
            last_modified=datetime.now()
        )

        # Create document object
        document_id = str(uuid.uuid4())
        document = Document(
            id=document_id,
            metadata=metadata,
            status=DocumentStatus.PENDING,
            version=1
        )

        # Store document in memory
        self.documents[document_id] = document

        # Start processing
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED
            raise

        return document

    async def get_document(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by ID.

        Args:
            document_id: The ID of the document to retrieve

        Returns:
            Document object if found, None otherwise
        """
        return self.documents.get(document_id)

    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
        """
        Retrieve a compliance report by ID.

        Args:
            report_id: The ID of the report to retrieve

        Returns:
            ComplianceReport object if found, None otherwise
        """
        return self.reports.get(report_id)

    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
        """
        Resubmit a document with changes.

        Args:
            document_id: The ID of the document to resubmit
            file: The updated document file

        Returns:
            Updated Document object
        """
        # Check if document exists
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document with ID {document_id} not found")

        # Get file content
        content = await self._read_file_content(file)

        # Update document metadata
        document.metadata.file_size = len(content)
        document.metadata.last_modified = datetime.now()
        document.version += 1
        document.status = DocumentStatus.PENDING

        # Process the updated document
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing resubmitted document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED

        return document

    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
        """
        Process document and generate compliance report.

        Args:
            document_id: The ID of the document
            content: Document content

        Returns:
            ComplianceReport object
        """
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Split document into sections
            sections = self._split_into_sections(content)

            # Generate embeddings for sections
            document.embedding = await self.embedding_service.embed_document(document_id, sections)

            # Identify relevant standards for the document
            if self.standards_service:
                # Log the standards service instance ID to verify singleton pattern
                logger.info(f"Using StandardsService instance: {id(self.standards_service)}")
                logger.info(f"Standards count before matching: {len(self.standards_service.standards)}")

                standard_names = await self.standards_service.get_standard_names_for_document(content)
                logger.info(f"Identified standards for document {document_id}: {standard_names}")
            else:
                logger.warning(f"No StandardsService available for document {document_id}")
                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]

            # Use reasoning service for compliance analysis
            report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)

            # Store the report
            self.reports[report.report_id] = report

            return report

        except Exception as e:
            logger.error(f"Error in document processing: {str(e)}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.

        Args:
            file: The file to read

        Returns:
            File content as string
        """
        file_content = file.read()

        # Try to decode as UTF-8
        try:
            return file_content.decode('utf-8')
        except UnicodeDecodeError:
            # Try other encodings if UTF-8 fails
            try:
                return file_content.decode('latin-1')
            except:
                raise ValueError("Unable to decode file content. Please ensure file is text-based.")

    def _get_file_type(self, filename: str) -> str:
        """
        Determine file type from filename.

        Args:
            filename: The name of the file

        Returns:
            File type (extension)
        """
        _, extension = os.path.splitext(filename)
        return extension.lstrip('.').lower()

    def _validate_file_type(self, filename: str) -> bool:
        """
        Validate if the file type is supported.

        Args:
            filename: Name of the file to validate

        Returns:
            bool: True if file type is supported, False otherwise
        """
        SUPPORTED_EXTENSIONS = {'.txt', '.md', '.rst', '.doc', '.docx', '.pdf'}
        _, ext = os.path.splitext(filename)
        return ext.lower() in SUPPORTED_EXTENSIONS

    def _split_into_sections(self, content: str) -> Dict[str, str]:
        """
        Split document content into sections.

        Args:
            content: The document content

        Returns:
            Dictionary mapping section names to section content
        """
        # This is a simple implementation - in production, you would use more advanced
        # techniques like heading detection, markdown parsing, etc.

        # For simplicity, we'll just split by markdown headings
        sections = {}

        # Add the whole document as one section
        sections["full_document"] = content

        # Try to split by markdown headings
        heading_pattern = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)
        matches = list(heading_pattern.finditer(content))

        if matches:
            for i, match in enumerate(matches):
                heading_level = len(match.group(1))
                section_name = match.group(2).strip()

                # Get section content (from this heading to the next, or to the end)
                start_pos = match.end()
                end_pos = matches[i+1].start() if i < len(matches) - 1 else len(content)

                section_content = content[start_pos:end_pos].strip()
                section_key = f"h{heading_level}_{section_name}"

                sections[section_key] = section_content
        else:
            # No headings found, try to split by newlines into paragraphs
            paragraphs = [p for p in content.split('\n\n') if p.strip()]

            for i, paragraph in enumerate(paragraphs):
                if len(paragraph) > 100:  # Only include substantial paragraphs
                    sections[f"paragraph_{i+1}"] = paragraph

        return sections

    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
        """
        Generate a mock compliance report for development/testing.

        Args:
            document_id: The ID of the document
            sections: Dictionary of document sections

        Returns:
            ComplianceReport object
        """
        # In production, this would use the reasoning service
        # For now, we'll generate a simple mock report

        # Create some mock issues
        issues = []

        if "full_document" in sections:
            content = sections["full_document"]

            # Check for missing sections (mock check)
            if "introduction" not in content.lower():
                issues.append(ComplianceIssue(
                    section="Document Structure",
                    description="Missing introduction section",
                    level=ComplianceLevel.MAJOR,
                    recommendation="Add an introduction section to provide context for the document"
                ))

            # Check for formatting issues (mock check)
            if content.count('#') < 3:
                issues.append(ComplianceIssue(
                    section="Formatting",
                    description="Insufficient section headings",
                    level=ComplianceLevel.MINOR,
                    recommendation="Use markdown headings to better structure the document"
                ))

            # Check for technical compliance (mock check)
            if "compliance" in content.lower() and "standard" not in content.lower():
                issues.append(ComplianceIssue(
                    section="Technical Content",
                    description="Mentions compliance but doesn't reference specific standards",
                    level=ComplianceLevel.CRITICAL,
                    recommendation="Specify which standards or regulations the document complies with"
                ))

        # Calculate mock compliance score
        if issues:
            compliance_score = max(0.0, 1.0 - (len(issues) * 0.1))
        else:
            compliance_score = 1.0

        # Create summary based on issues
        if not issues:
            summary = "The document meets all compliance requirements. No issues found."
        else:
            critical_count = sum(1 for i in issues if i.level == ComplianceLevel.CRITICAL)
            major_count = sum(1 for i in issues if i.level == ComplianceLevel.MAJOR)
            minor_count = sum(1 for i in issues if i.level == ComplianceLevel.MINOR)

            summary = f"The document has {len(issues)} compliance issues: "
            if critical_count:
                summary += f"{critical_count} critical, "
            if major_count:
                summary += f"{major_count} major, "
            if minor_count:
                summary += f"{minor_count} minor."
            else:
                summary = summary.rstrip(", ") + "."

            summary += " See detailed report for recommendations."

        # Create report
        report = ComplianceReport(
            document_id=document_id,
            compliance_score=compliance_score,
            summary=summary,
            issues=issues
        )

        return report

    async def _process_document(self, document_id: str, content: str) -> None:
        """
        Internal method to process a document and update its status.

        Args:
            document_id: The ID of the document to process
            content: The document content
        """
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Update status to processing
            document.status = DocumentStatus.PROCESSING

            # Generate compliance report
            report = await self.process_document(document_id, content)

            # Store report ID in document
            document.reports.append(report.report_id)

            # Update document status
            document.status = DocumentStatus.COMPLETED

        except Exception as e:
            # Update document status to failed
            if document:
                document.status = DocumentStatus.FAILED
            raise

    async def get_document_stats(self, document_id: str) -> Dict[str, any]:
        """
        Get statistics for a document.

        Args:
            document_id: The ID of the document

        Returns:
            Dictionary containing document statistics
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        latest_report = None
        if document.reports:
            latest_report = await self.get_report(document.reports[-1])

        stats = {
            "document_id": document_id,
            "version": document.version,
            "status": document.status,
            "file_size": document.metadata.file_size,
            "upload_date": document.metadata.upload_timestamp,
            "last_modified": document.metadata.last_modified,
            "num_reports": len(document.reports),
            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
            "major_issues": latest_report.major_issues_count if latest_report else 0,
            "minor_issues": latest_report.minor_issues_count if latest_report else 0
        }

        return stats

    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
        """
        Remove documents older than specified days.

        Args:
            days: Number of days after which documents should be removed

        Returns:
            List of removed document IDs
        """
        cutoff_date = datetime.now() - timedelta(days=days)
        removed_ids = []

        for doc_id, document in list(self.documents.items()):
            if document.metadata.upload_timestamp < cutoff_date:
                # Remove associated reports
                for report_id in document.reports:
                    self.reports.pop(report_id, None)

                # Remove document
                self.documents.pop(doc_id)
                removed_ids.append(doc_id)

        return removed_ids