data and code update.

2024-11-29 14:59:00 +01:00
parent b7a73606e0
commit 4a0ff6527c
13 changed files with 8665 additions and 1640 deletions
@@ -141,7 +141,37 @@ def influencer_data(influencer_name: str, product_category:str):
    
    return json.dumps(response)

-# Function to get all influencers details concurrently for a category
+
+def clean_influencer_data(influencer_data: list) -> dict:  # Prompts the LLM to expand k/M/B follower-count shorthand and returns its parsed reply.
+    logger.info("Cleaning and correcting influencer data")
+    # NOTE(review): the visible caller passes a json.dumps() string, not a list, so the `list` annotation looks stale - confirm.
+    initiator_prompt = PromptTemplate(
+        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+        You are a data-cleaning AI agent. Your task is to review and correct the follower count data for influencers, while maintaining the exact structure of the input.
+
+        Instructions:
+        - Locate follower or subscriber counts within the data.
+        - Convert shorthand notation to full integer values:
+            - 'k' represents thousands (e.g., '25k' becomes 25000).
+            - 'M' represents millions (e.g., '1.2M' becomes 1200000).
+            - 'B' represents billions (e.g., '1B' becomes 1000000000).
+        - Leave full integer counts as they are.
+        - If a follower or subscriber count is missing, keep it as NULL.
+        
+        Return the cleaned data in the same JSON format as provided, with only corrected follower counts if needed.
+    <|eot_id|><|start_header_id|>user<|end_header_id|>
+    {influencer_data}
+    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
+        input_variables=["influencer_data"]
+    )
+    # Chain: render the prompt, send it to `llm` (defined outside this function), then parse the reply with JsonOutputParser.
+    initiator_router = initiator_prompt | llm | JsonOutputParser()
+    output = initiator_router.invoke({"influencer_data": influencer_data})
+    # NOTE(review): nothing here validates the parsed reply, so the declared `dict` return type is not enforced.
+    return output
+
+
+# Function to get all influencers' details concurrently for a category
 def get_all_influencer_data(influencer_names: list, category: str):
    all_influencers_data = []

@@ -157,8 +187,38 @@ def get_all_influencer_data(influencer_names: list, category: str):
                all_influencers_data.append(influencer_details)
            except Exception as exc:
                logger.error(f"{influencer} generated an exception: {exc}")
+                
+    # Convert the result to JSON format
+    all_influencers_json = json.dumps(all_influencers_data, indent=4)
+
+    # cleaning 
+    cleaned = clean_influencer_data(all_influencers_json)
    
-    return all_influencers_data
+    return cleaned
+
+def extract_youtube_data(influencer_data: list) -> dict:  # Prompts the LLM for an influencer-name -> YouTube-channel mapping and returns its parsed reply.
+    logger.info("Extracting YouTube channel names for influencers")
+    # The template below is sent as written; {influencer_data} is its only declared input variable.
+    initiator_prompt = PromptTemplate(
+        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+        You are a data-extraction AI agent. Your task is to extract the YouTube channel name for each influencer from the provided data. If the YouTube channel name is not available, return null for that influencer. 
+
+        Instructions:
+        - For each influencer, locate their YouTube channel name.
+        - If the YouTube channel name is present, include it in the output.
+        - If the YouTube channel name is missing, return null for that influencer.
+        
+        Return the output as a dictionary where the keys are the influencer names and the values are their corresponding YouTube channel names.
+    <|eot_id|><|start_header_id|>user<|end_header_id|>
+    {influencer_data}
+    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
+        input_variables=["influencer_data"]
+    )
+    # Chain: render the prompt, send it to `llm` (defined outside this function), then parse the reply with JsonOutputParser.
+    initiator_router = initiator_prompt | llm | JsonOutputParser()
+    output = initiator_router.invoke({"influencer_data": influencer_data})
+    # NOTE(review): the name -> channel dict shape is only requested in the prompt text; it is not checked here.
+    return output


 # # data = get_all_influencer_data(influencer_names=['Kylie Swift'], category="Entertainment & Pop Culture")