Role extraction and SOP generation added

2024-08-31 01:29:39 +00:00
parent ccb0db21d6
commit 1f02a30a16
15 changed files with 734 additions and 11 deletions
@@ -0,0 +1,113 @@
+import os
+import json
+from openai import OpenAI
+from pydantic import BaseModel, Field
+from typing import List, Dict, Optional
+
class SOPs(BaseModel):
    # Container for one role's SOP statements, split into three buckets named
    # after the obligation keywords the extraction prompt asks for
    # ("must", "shall", "will").
    #
    # Every bucket accepts a list of strings or None; when a bucket is not
    # supplied it defaults to a fresh empty list (default_factory=list).
    must: Optional[List[str]] = Field(default_factory=list)
    shall: Optional[List[str]] = Field(default_factory=list)
    will: Optional[List[str]] = Field(default_factory=list)
+
class RoleSOPs(BaseModel):
    # Response schema for a single role: SopGenerator.generate_sops passes
    # this class as `response_format`, so each per-role reply is expected to
    # carry one required `sops` object.
    sops: SOPs
+
class SOPsFound(BaseModel):
    # Response schema used by SopGenerator.check_role_sop.
    #
    # message: short human-readable verdict text produced by the model.
    # status:  True when the SOPs were found in the document, False otherwise
    #          (as requested by the system prompt of check_role_sop).
    message: str
    status: bool
+
class RolesResponse(BaseModel):
    # Response schema used by SopGenerator.get_roles: the role/position names
    # extracted from the documents. The prompt asks for an empty list when no
    # roles are found.
    roles: List[str]
+
class SOPsResponse(BaseModel):
    # Aggregate of SOPs objects keyed by string (presumably the role name --
    # confirm against callers). Defaults to a fresh empty dict when omitted.
    roles_sops: Dict[str, SOPs] = Field(default_factory=dict)
+
class SopGenerator:
    """Extract roles and role-specific SOPs from documents with OpenAI.

    Each public method sends the text of the supplied documents to the
    chat-completions ``parse`` endpoint together with a pydantic model as
    ``response_format`` and returns the model's JSON reply decoded into plain
    Python ``dict`` objects (not pydantic instances).
    """

    DEFAULT_MODEL = "gpt-4o-mini"
    MAX_TOKENS = 1024
    TEMPERATURE = 0.1

    def __init__(self, model: str = DEFAULT_MODEL, api_key: Optional[str] = None):
        """Create the OpenAI client.

        Args:
            model: Chat model name used for every request.
            api_key: Explicit API key; when omitted the ``OPENAI_API_KEY``
                environment variable is used.
        """
        self.api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.api_key)
        self.model = model

    def _extract_text_from_docs(self, docs):
        """Return the ``page_content`` of every document, in input order."""
        return [doc.page_content for doc in docs]

    @staticmethod
    def _build_user_message(docs_text: List[str]) -> Dict[str, object]:
        """Wrap each document text in a text content part of one user message."""
        return {
            "role": "user",
            "content": [{"type": "text", "text": text} for text in docs_text],
        }

    def _request_json(self, system_prompt: str, user_message, response_format):
        """Send one structured-output request and return the decoded JSON reply.

        Raises:
            ValueError: If the reply has no content (e.g. the model refused),
                or if the content is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError`` subclass).
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                user_message,
            ],
            response_format=response_format,
            max_tokens=self.MAX_TOKENS,
            temperature=self.TEMPERATURE,
        )
        message = response.choices[0].message
        if message.content is None:
            # Without this guard json.loads(None) fails with an opaque TypeError.
            refusal = getattr(message, "refusal", None)
            raise ValueError(
                f"Model returned no content for {self.model!r} (refusal: {refusal!r})"
            )
        return json.loads(message.content)

    def get_roles(self, docs) -> Dict[str, List[str]]:
        """Extract the roles/positions mentioned in ``docs``.

        Args:
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict shaped like ``RolesResponse``: ``{"roles": [...]}``.
            With no documents the result is ``{"roles": []}`` and no request
            is sent.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse, so skip the API call.
            return {"roles": []}

        system_prompt = '''Suppose you are a role/position extractor from a company document. 
                                  You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].
                                  If no roles are found, return an empty list.'''
        return self._request_json(
            system_prompt, self._build_user_message(docs_text), RolesResponse
        )

    def check_role_sop(self, roles: str, docs) -> Dict[str, object]:
        """Ask the model whether ``docs`` contain SOPs for ``roles``.

        Args:
            roles: Role name(s) interpolated verbatim into the prompt.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict shaped like ``SOPsFound``: ``{"message": str, "status": bool}``.
            With no documents a "not found" result is returned and no request
            is sent.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse, so skip the API call.
            return {
                "message": f"SOPs not found for the role: {roles}",
                "status": False,
            }

        system_prompt = f'''Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.
                                   You are validating if the document can provide the SOPs.
                                   Return status=True with a proper message if found, and status=False with a proper message if not.
                                   Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'''
        return self._request_json(
            system_prompt, self._build_user_message(docs_text), SOPsFound
        )

    def generate_sops(self, roles: List[str], docs) -> Dict[str, Dict[str, object]]:
        """Extract the SOPs of every role in ``roles`` from ``docs``.

        One request is sent per role.

        Args:
            roles: Role names to look up; each becomes a key of the result.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            ``{role: {"sops": {"must": [...], "shall": [...], "will": [...]}}}``
            where every value is shaped like ``RoleSOPs``. With no documents
            each role maps to empty categories and no request is sent.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse, so skip the API calls.
            return {
                role: {"sops": {"must": [], "shall": [], "will": []}}
                for role in roles
            }

        # The document payload is the same for every role, so build it once.
        user_message = self._build_user_message(docs_text)

        roles_sops_all = {}
        for role in roles:
            system_prompt = f'''You are a Standard Operating Procedure (SOP) extractor. 
                                       Your task is to find SOPs for the role "{role}" in the provided text.
                                       SOPs should be categorized under "must", "shall", and "will".
                                       If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document, 
                                       but only if there is clear evidence within the text. 
                                       Do not generate or assume SOPs that are not directly supported by the document.
                                        Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.
                                       If no SOPs are found for the role, return an empty list for each category.'''
            roles_sops_all[role] = self._request_json(
                system_prompt, user_message, RoleSOPs
            )

        return roles_sops_all
+