114 lines
4.7 KiB
Python
114 lines
4.7 KiB
Python
|
|
import os
|
||
|
|
import json
|
||
|
|
from openai import OpenAI
|
||
|
|
from pydantic import BaseModel, Field
|
||
|
|
from typing import List, Dict, Optional
|
||
|
|
|
||
|
|
class SOPs(BaseModel):
    # Procedures for a single role, bucketed by the obligation keyword the
    # extraction prompt asks for ("must", "shall", "will").
    # Each bucket defaults to a fresh empty list when omitted; the Optional
    # annotation additionally lets a bucket be given explicitly as None.
    must: Optional[List[str]] = Field(default_factory=list)
    shall: Optional[List[str]] = Field(default_factory=list)
    will: Optional[List[str]] = Field(default_factory=list)
|
||
|
|
|
||
|
|
class RoleSOPs(BaseModel):
    # Structured-output schema for one role's extraction request: a single
    # required "sops" object holding the must/shall/will buckets.
    sops: SOPs
|
||
|
|
|
||
|
|
class SOPsFound(BaseModel):
    # Structured-output schema for the "are SOPs present?" check.
    # message: short human-readable verdict produced by the model.
    message: str
    # status: True when SOPs were found for the requested roles, else False.
    status: bool
|
||
|
|
|
||
|
|
class RolesResponse(BaseModel):
    # Structured-output schema for role extraction: the role/position names
    # found in the documents (an empty list when none are found).
    roles: List[str]
|
||
|
|
|
||
|
|
class SOPsResponse(BaseModel):
    # Mapping of role name -> SOPs for that role; defaults to an empty dict.
    roles_sops: Dict[str, SOPs] = Field(default_factory=dict)
|
||
|
|
|
||
|
|
class SopGenerator:
    """Extract roles and their SOPs from documents using an OpenAI chat model.

    Every public method sends the documents' text to the model with a
    task-specific system prompt and a pydantic ``response_format``, then
    returns the model's JSON answer decoded with ``json.loads``.

    NOTE(review): the return annotations name pydantic models, but the values
    actually returned are plain ``dict`` objects with the same keys. The
    runtime shape is kept as-is so existing callers continue to work.
    """

    def __init__(self):
        """Read ``OPENAI_API_KEY`` from the environment and build the client."""
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o-mini"

    def _extract_text_from_docs(self, docs):
        """Return the ``page_content`` of every document in *docs* as a list."""
        return [doc.page_content for doc in docs]

    def _structured_completion(self, system_prompt, docs_text, response_format):
        """Run one structured-output request and return the decoded JSON.

        Args:
            system_prompt: Instructions sent as the system message.
            docs_text: Document texts; each becomes one text part of the
                user message.
            response_format: Pydantic model describing the expected JSON.

        Returns:
            The first choice's message content parsed with ``json.loads``.

        Raises:
            ValueError: If the message has no content (for example when the
                model refused), instead of the opaque ``TypeError`` that
                ``json.loads(None)`` would raise.
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [{"type": "text", "text": text} for text in docs_text],
                },
            ],
            response_format=response_format,
            max_tokens=1024,
            temperature=0.1,
        )
        message = response.choices[0].message
        if message.content is None:
            # getattr: tolerate message objects that do not expose `refusal`.
            refusal = getattr(message, "refusal", None)
            raise ValueError(
                f"Model returned no content (refusal: {refusal!r})"
            )
        return json.loads(message.content)

    def get_roles(self, docs) -> RolesResponse:
        """Extract the roles/positions mentioned in *docs*.

        Args:
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict of the form ``{"roles": [...]}`` (see the class note about
            the annotation). With no documents, ``{"roles": []}`` is returned
            without calling the API.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Nothing to analyse; skip a request whose user message would
            # carry no content parts.
            return {"roles": []}

        system_prompt = (
            'Suppose you are a role/position extractor from a company document.\n'
            'You extract the roles as a list, e.g., ["financial analyst", "data scientist", etc.].\n'
            'If no roles are found, return an empty list.'
        )
        return self._structured_completion(system_prompt, docs_text, RolesResponse)

    def check_role_sop(self, roles: str, docs) -> SOPsFound:
        """Ask the model whether *docs* contain SOPs for *roles*.

        Args:
            roles: Role name(s), interpolated verbatim into the prompt.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict of the form ``{"message": str, "status": bool}``. With no
            documents, a "not found" result is returned without calling the
            API.
        """
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            # Same wording the prompt asks the model to use for a miss.
            return {
                "message": f"SOPs not found for the role: {roles}",
                "status": False,
            }

        system_prompt = (
            f'Your role is to check if the SOPs for the provided roles "{roles}" are found in the document.\n'
            'You are validating if the document can provide the SOPs.\n'
            'Return status=True with a proper message if found, and status=False with a proper message if not.\n'
            f'Keep the message short, e.g., "SOPs found for the role: {roles}" or "SOPs not found for the role: {roles}".'
        )
        return self._structured_completion(system_prompt, docs_text, SOPsFound)

    def generate_sops(self, roles: List[str], docs) -> SOPsResponse:
        """Extract SOPs for every role in *roles*, one model request per role.

        Args:
            roles: Role names to extract SOPs for.
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            A dict mapping each role to the decoded ``RoleSOPs`` payload,
            i.e. ``{role: {"sops": {"must": [...], "shall": [...],
            "will": [...]}}}``. With no documents, every role maps to empty
            lists and no API call is made.
        """
        # The document text is the same for every role, so extract it once.
        docs_text = self._extract_text_from_docs(docs)
        if not docs_text:
            return {
                role: {"sops": {"must": [], "shall": [], "will": []}}
                for role in roles
            }

        roles_sops_all = {}
        for role in roles:
            system_prompt = (
                'You are a Standard Operating Procedure (SOP) extractor.\n'
                f'Your task is to find SOPs for the role "{role}" in the provided text.\n'
                'SOPs should be categorized under "must", "shall", and "will".\n'
                'If the SOPs for the role are not explicitly stated, you are required to infer them from the context provided in the document,\n'
                'but only if there is clear evidence within the text.\n'
                'Do not generate or assume SOPs that are not directly supported by the document.\n'
                'Your extraction should strictly adhere to the content of the document, ensuring that no information is fabricated or inferred beyond what is present.\n'
                'If no SOPs are found for the role, return an empty list for each category.'
            )
            roles_sops_all[role] = self._structured_completion(
                system_prompt, docs_text, RoleSOPs
            )

        return roles_sops_all
|
||
|
|
|