# src/services/sop_generator.py

import os
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Dict, Optional

# Container for SOP statements grouped into the three categories the
# extraction prompt asks for: "must", "shall" and "will".
# Used as the type of RoleSOPs.sops and as the value type of
# SOPsResponse.roles_sops.
class SOPs(BaseModel):
    # Each field accepts a list of strings or None; when the field is omitted
    # it defaults to a new empty list (default_factory=list).
    must: Optional[List[str]] = Field(default_factory=list)
    shall: Optional[List[str]] = Field(default_factory=list)
    will: Optional[List[str]] = Field(default_factory=list)

# Wrapper holding the SOPs of a single role under the key "sops".
# Passed as `response_format` for each per-role request made by
# SopGenerator.generate_sops.
class RoleSOPs(BaseModel):
    # Required field: the categorized SOP statements for the role.
    sops: SOPs

# Result schema for the "are SOPs present for this role?" check.
# Passed as `response_format` by SopGenerator.check_role_sop.
class SOPsFound(BaseModel):
    # Short human-readable summary; the prompt asks for text such as
    # "SOPs found for the role: ..." / "SOPs not found for the role: ...".
    message: str
    # The prompt asks for True when SOPs are found and False otherwise.
    status: bool

# Result schema for role extraction.
# Passed as `response_format` by SopGenerator.get_roles.
class RolesResponse(BaseModel):
    # Required list of role/position names; the prompt asks for an empty list
    # when no roles are found.
    roles: List[str]

# Aggregate schema: a mapping from a string key to that key's SOPs.
# NOTE(review): keys are presumably role names — confirm against callers.
class SOPsResponse(BaseModel):
    # Defaults to a new empty dict when omitted (default_factory=dict).
    roles_sops: Dict[str, SOPs] = Field(default_factory=dict)

class SopGenerator:
    """Extract roles and per-role SOPs from documents via OpenAI structured outputs.

    Every public method sends the text of the given documents to the chat
    completions ``parse`` endpoint together with a task-specific system prompt
    and a pydantic ``response_format``, then returns the JSON-decoded message
    content. The return values are therefore plain ``dict`` objects, not
    pydantic model instances.

    When ``docs`` is empty, no request is made: the user message would carry an
    empty content array (which the API is expected to reject — TODO confirm),
    so the methods return the corresponding "nothing found" result directly.
    """

    # Class-level defaults; ``__init__`` overrides them per instance.
    model = "gpt-4o-mini"
    max_tokens = 1024
    temperature = 0.1

    def __init__(self, model: str = "gpt-4o-mini", max_tokens: int = 1024,
                 temperature: float = 0.1):
        """Create the OpenAI client from the ``OPENAI_API_KEY`` environment variable.

        Args:
            model: Chat model name used for every request.
            max_tokens: Completion token limit passed to every request.
            temperature: Sampling temperature passed to every request.
        """
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.api_key)
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

    def _extract_text_from_docs(self, docs):
        """Extract text content from document objects."""
        return [doc.page_content for doc in docs]

    def _build_user_message(self, docs_text):
        """Build the user message: one ``text`` content part per document text."""
        return {
            "role": "user",
            "content": [{"type": "text", "text": text} for text in docs_text],
        }

    def _parse_completion(self, system_prompt, user_message, response_format):
        """Send one structured-output request and return the JSON-decoded content.

        Args:
            system_prompt: Text of the system message.
            user_message: Message dict as produced by ``_build_user_message``.
            response_format: Pydantic model class describing the expected output.

        Returns:
            Whatever ``json.loads`` yields for the first choice's message content.

        Raises:
            TypeError: From ``json.loads`` if the message content is ``None``
                (presumably the case when the model refuses — TODO confirm).
            json.JSONDecodeError: If the content is not valid JSON.
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                user_message,
            ],
            response_format=response_format,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
        )
        return json.loads(response.choices[0].message.content)

    def get_roles(self, docs) -> dict:
        """Extract the roles/positions mentioned in ``docs``.

        Args:
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            The decoded response, requested in the ``RolesResponse`` shape
            (``{"roles": [...]}``). ``{"roles": []}`` when ``docs`` is empty.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse: skip the request and report no roles.
            return {"roles": []}

        # Prompt whitespace is part of the string sent to the model; keep as is.
        system_prompt = '''Suppose you are a role/position extractor from a company document. 
                                  You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].
                                  If no roles are found, return an empty list.'''
        return self._parse_completion(
            system_prompt, self._build_user_message(docs_text), RolesResponse
        )

    def check_role_sop(self, roles: str, docs) -> dict:
        """Check whether ``docs`` contain SOPs for ``roles``.

        Args:
            roles: Role text interpolated verbatim into the system prompt.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            The decoded response, requested in the ``SOPsFound`` shape
            (``{"message": ..., "status": ...}``). When ``docs`` is empty the
            result is ``status=False`` with the "not found" message.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse: skip the request and report "not found".
            return {
                "message": f"SOPs not found for the role: {roles}",
                "status": False,
            }

        # Prompt whitespace is part of the string sent to the model; keep as is.
        system_prompt = f'''Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.
                                   You are validating if the document can provide the SOPs.
                                   Return status=True with a proper message if found, and status=False with a proper message if not.
                                   Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'''
        return self._parse_completion(
            system_prompt, self._build_user_message(docs_text), SOPsFound
        )

    def generate_sops(self, roles: List[str], docs) -> Dict[str, dict]:
        """Extract SOPs for every role in ``roles``, one request per role.

        Args:
            roles: Role names; each is interpolated into its own system prompt.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict mapping each role to its decoded response, requested in the
            ``RoleSOPs`` shape (``{"sops": {"must": [...], "shall": [...],
            "will": [...]}}``). When ``docs`` is empty every role maps to that
            shape with empty lists. A role listed more than once keeps the
            result of its last occurrence.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse: skip the requests; fresh lists for each role.
            return {
                role: {"sops": {"must": [], "shall": [], "will": []}}
                for role in roles
            }

        # The document payload is identical for every role, so build it once.
        user_message = self._build_user_message(docs_text)

        roles_sops_all = {}
        for role in roles:
            # Prompt whitespace is part of the string sent to the model; keep as is.
            system_prompt = f'''You are a Standard Operating Procedure (SOP) extractor. 
                                       Your task is to find SOPs for the role "{role}" in the provided text.
                                       SOPs should be categorized under "must", "shall", and "will".
                                       If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document, 
                                       but only if there is clear evidence within the text. 
                                       Do not generate or assume SOPs that are not directly supported by the document.
                                        Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.
                                       If no SOPs are found for the role, return an empty list for each category.'''
            roles_sops_all[role] = self._parse_completion(
                system_prompt, user_message, RoleSOPs
            )

        return roles_sops_all
role extraction and sop generation added 2024-08-31 01:29:39 +00:00			`import os`
			`import json`
			`from openai import OpenAI`
			`from pydantic import BaseModel, Field`
			`from typing import List, Dict, Optional`

			`class SOPs(BaseModel):`
			`must: Optional[List[str]] = Field(default_factory=list)`
			`shall: Optional[List[str]] = Field(default_factory=list)`
			`will: Optional[List[str]] = Field(default_factory=list)`

			`class RoleSOPs(BaseModel):`
			`sops: SOPs`

			`class SOPsFound(BaseModel):`
			`message: str`
			`status: bool`

			`class RolesResponse(BaseModel):`
			`roles: List[str]`

			`class SOPsResponse(BaseModel):`
			`roles_sops: Dict[str, SOPs] = Field(default_factory=dict)`

			`class SopGenerator:`
			`def __init__(self):`
			`self.api_key = os.getenv("OPENAI_API_KEY")`
			`self.client = OpenAI(api_key=self.api_key)`
			`self.model = "gpt-4o-mini"`

			`def _extract_text_from_docs(self, docs):`
			`"""Extract text content from document objects."""`
			`return [doc.page_content for doc in docs]`

			`def get_roles(self, docs) -> RolesResponse:`
			`docs_text = self._extract_text_from_docs(docs)`
			`response = self.client.beta.chat.completions.parse(`
			`model=self.model,`
			`messages=[`
			`{`
			`"role": "system",`
			`"content": '''Suppose you are a role/position extractor from a company document.`
			`You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].`
			`If no roles are found, return an empty list.''',`
			`},`
			`{`
			`"role": "user",`
			`"content": [{"type": "text", "text": text} for text in docs_text],`
			`}`
			`],`
			`response_format=RolesResponse,`
			`max_tokens=1024,`
			`temperature=0.1`
			`)`
			`return json.loads(response.choices[0].message.content)`

			`def check_role_sop(self, roles: str, docs) -> SOPsFound:`
			`docs_text = self._extract_text_from_docs(docs)`
			`response = self.client.beta.chat.completions.parse(`
			`model=self.model,`
			`messages=[`
			`{`
			`"role": "system",`
			`"content": f'''Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.`
			`You are validating if the document can provide the SOPs.`
			`Return status=True with a proper message if found, and status=False with a proper message if not.`
			`Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'''`
			`},`
			`{`
			`"role": "user",`
			`"content": [{"type": "text", "text": text} for text in docs_text],`
			`}`
			`],`
			`response_format=SOPsFound,`
			`max_tokens=1024,`
			`temperature=0.1`
			`)`
			`return json.loads(response.choices[0].message.content)`

			`def generate_sops(self, roles: List[str], docs) -> SOPsResponse:`
			`roles_sops_all = {}`

			`docs_text = self._extract_text_from_docs(docs)`

			`for role in roles:`
			`response = self.client.beta.chat.completions.parse(`
			`model=self.model,`
			`messages=[`
			`{`
			`"role": "system",`
			`"content": f'''You are a Standard Operating Procedure (SOP) extractor.`
			`Your task is to find SOPs for the role "{role}" in the provided text.`
			`SOPs should be categorized under "must", "shall", and "will".`
			`If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document,`
			`but only if there is clear evidence within the text.`
			`Do not generate or assume SOPs that are not directly supported by the document.`
			`Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.`
			`If no SOPs are found for the role, return an empty list for each category.''',`
			`},`
			`{`
			`"role": "user",`
			`"content": [{"type": "text", "text": text} for text in docs_text],`
			`}`
			`],`
			`response_format=RoleSOPs,`
			`max_tokens=1024,`
			`temperature=0.1`
			`)`
			`role_sop = json.loads(response.choices[0].message.content)`
			`roles_sops_all[role] = role_sop`

			`return roles_sops_all`