From 2ee7d1d0e4a99008b9356cd08dca50d7cb25d975 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 13 Sep 2024 15:08:23 -0400 Subject: [PATCH 1/4] init --- .gitignore | 1 + examples/o1_web_crawler /o1_web_crawler.py | 132 +++++++++++++++++++++ examples/o1_web_crawler /requirements.txt | 3 + 3 files changed, 136 insertions(+) create mode 100644 examples/o1_web_crawler /o1_web_crawler.py create mode 100644 examples/o1_web_crawler /requirements.txt diff --git a/.gitignore b/.gitignore index 91b7ef48..367f28a7 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ apps/test-suite/load-test-results/test-run-report.json apps/playwright-service-ts/node_modules/ apps/playwright-service-ts/package-lock.json +/examples/o1_web_crawler /venv diff --git a/examples/o1_web_crawler /o1_web_crawler.py b/examples/o1_web_crawler /o1_web_crawler.py new file mode 100644 index 00000000..497dd771 --- /dev/null +++ b/examples/o1_web_crawler /o1_web_crawler.py @@ -0,0 +1,132 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from openai import OpenAI + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +openai_api_key = os.getenv("OPENAI_API_KEY") + +# Initialize the FirecrawlApp and OpenAI client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = OpenAI(api_key=openai_api_key) + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"Okay, the objective is: {objective}") + print(f"I am going to search the website: {url}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + print("I'm asking the AI to suggest a search parameter...") + completion = client.chat.completions.create( + model="o1-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": map_prompt + } + ] + } + ] + ) + + map_search_parameter = completion.choices[0].message.content + print(f"I think the search parameter should be: {map_search_parameter}") + + print(f"Now I'm going to map the website using this search parameter...") + map_website = app.map_url(url, params=map_search_parameter) + print("I've successfully mapped the website!") + return map_website + except Exception as e: + print(f"Oops! An error occurred while finding the relevant page: {str(e)}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app, client): + try: + # Get top 3 links from the map result + top_links = map_website['links'][:3] + print(f"I'm going to check the top 3 links: {top_links}") + + for link in top_links: + print(f"Now I'm scraping this page: {link}") + # Scrape the page + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + print("I've successfully scraped the page!") + + # Check if objective is met + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in JSON format. + If not, respond with 'Objective not met'. + + Objective: {objective} + Scraped content: {scrape_result['data']['markdown']} + """ + + print("I'm asking the AI to check if the objective is met...") + completion = client.chat.completions.create( + model="o1-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": check_prompt + } + ] + } + ] + ) + + result = completion.choices[0].message.content + + if result != "Objective not met": + print("Great news! I think I've found what we're looking for!") + return json.loads(result) + else: + print("This page doesn't seem to have what we need. Moving to the next one...") + + print("I've checked all 3 pages, but couldn't find what we're looking for.") + return None + except Exception as e: + print(f"Oh no! An error occurred while scraping top pages: {str(e)}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input("Enter the website to crawl: ") + objective = input("Enter your objective: ") + + print("Alright, let's get started!") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app, client) + + if map_website: + print("Great! I've found some relevant pages. Now let's see if we can find what we're looking for...") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app, client) + + if result: + print("Success! I've found what you're looking for. Here's the extracted information:") + print(json.dumps(result, indent=2)) + else: + print("I'm sorry, but I couldn't find what you're looking for in the top 3 pages.") + else: + print("I'm afraid I couldn't find any relevant pages. Maybe we could try a different website or rephrase the objective?") + +if __name__ == "__main__": + main() diff --git a/examples/o1_web_crawler /requirements.txt b/examples/o1_web_crawler /requirements.txt new file mode 100644 index 00000000..249f8beb --- /dev/null +++ b/examples/o1_web_crawler /requirements.txt @@ -0,0 +1,3 @@ +firecrawl-py +python-dotenv +openai \ No newline at end of file From 3900603a28910af1b6fed23835774196b2da45c6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 16 Sep 2024 11:18:57 -0400 Subject: [PATCH 2/4] Almost done --- examples/o1_web_crawler /o1_web_crawler.py | 65 +++++++++++++--------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/examples/o1_web_crawler /o1_web_crawler.py b/examples/o1_web_crawler /o1_web_crawler.py index 497dd771..a0f98034 100644 --- a/examples/o1_web_crawler /o1_web_crawler.py +++ b/examples/o1_web_crawler /o1_web_crawler.py @@ -18,14 +18,14 @@ client = OpenAI(api_key=openai_api_key) # Find the page that most likely contains the objective def find_relevant_page_via_map(objective, url, app, client): try: - print(f"Okay, the objective is: {objective}") - print(f"I am going to search the website: {url}") + print(f"Understood. The objective is: {objective}") + print(f"Initiating search on the website: {url}") map_prompt = f""" The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. """ - print("I'm asking the AI to suggest a search parameter...") + print("Analyzing objective to determine optimal search parameter...") completion = client.chat.completions.create( model="o1-preview", messages=[ @@ -42,40 +42,47 @@ def find_relevant_page_via_map(objective, url, app, client): ) map_search_parameter = completion.choices[0].message.content - print(f"I think the search parameter should be: {map_search_parameter}") + print(f"Optimal search parameter identified: {map_search_parameter}") - print(f"Now I'm going to map the website using this search parameter...") - map_website = app.map_url(url, params=map_search_parameter) - print("I've successfully mapped the website!") + print(f"Mapping website using the identified search parameter...") + map_website = app.map_url(url, params={"search": map_search_parameter}) + print("Website mapping completed successfully.") + print(f"Located {len(map_website)} relevant links.") return map_website except Exception as e: - print(f"Oops! An error occurred while finding the relevant page: {str(e)}") + print(f"Error encountered during relevant page identification: {str(e)}") return None # Scrape the top 3 pages and see if the objective is met, if so return in json format else return None def find_objective_in_top_pages(map_website, objective, app, client): try: # Get top 3 links from the map result - top_links = map_website['links'][:3] - print(f"I'm going to check the top 3 links: {top_links}") + top_links = map_website[:3] if isinstance(map_website, list) else [] + print(f"Proceeding to analyze top {len(top_links)} links: {top_links}") for link in top_links: - print(f"Now I'm scraping this page: {link}") + print(f"Initiating scrape of page: {link}") # Scrape the page scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) - print("I've successfully scraped the page!") + print("Page scraping completed successfully.") + # Check if objective is met check_prompt = f""" - Given the following scraped content and objective, determine if the objective is met. - If it is, extract the relevant information in JSON format. - If not, respond with 'Objective not met'. + Given the following scraped content and objective, determine if the objective is met with high confidence. + If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. + If the objective is not met with high confidence, respond with 'Objective not met'. Objective: {objective} - Scraped content: {scrape_result['data']['markdown']} + Scraped content: {scrape_result['markdown']} + + Remember: + 1. Only return JSON if you are highly confident the objective is fully met. + 2. Keep the JSON structure as simple and flat as possible. + 3. Do not include any explanations or markdown formatting in your response. """ - print("I'm asking the AI to check if the objective is met...") + print("Analyzing scraped content to determine objective fulfillment...") completion = client.chat.completions.create( model="o1-preview", messages=[ @@ -94,15 +101,19 @@ def find_objective_in_top_pages(map_website, objective, app, client): result = completion.choices[0].message.content if result != "Objective not met": - print("Great news! I think I've found what we're looking for!") - return json.loads(result) + print("Objective potentially fulfilled. Relevant information identified.") + try: + print(result) + return json.loads(result) + except json.JSONDecodeError: + print("Error in parsing response. Proceeding to next page...") else: - print("This page doesn't seem to have what we need. Moving to the next one...") + print("Objective not met on this page. Proceeding to next link...") - print("I've checked all 3 pages, but couldn't find what we're looking for.") + print("All available pages analyzed. Objective not fulfilled in examined content.") return None except Exception as e: - print(f"Oh no! An error occurred while scraping top pages: {str(e)}") + print(f"Error encountered during page analysis: {str(e)}") return None # Main function to execute the process @@ -111,22 +122,22 @@ def main(): url = input("Enter the website to crawl: ") objective = input("Enter your objective: ") - print("Alright, let's get started!") + print("Initiating web crawling process.") # Find the relevant page map_website = find_relevant_page_via_map(objective, url, app, client) if map_website: - print("Great! I've found some relevant pages. Now let's see if we can find what we're looking for...") + print("Relevant pages identified. Proceeding with detailed analysis...") # Find objective in top pages result = find_objective_in_top_pages(map_website, objective, app, client) if result: - print("Success! I've found what you're looking for. Here's the extracted information:") + print("Objective successfully fulfilled. Extracted information:") print(json.dumps(result, indent=2)) else: - print("I'm sorry, but I couldn't find what you're looking for in the top 3 pages.") + print("Unable to fulfill the objective with the available content.") else: - print("I'm afraid I couldn't find any relevant pages. Maybe we could try a different website or rephrase the objective?") + print("No relevant pages identified. Consider refining the search parameters or trying a different website.") if __name__ == "__main__": main() From 8c05aed6e9998889802dc03c3e696211f87f2ebe Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 16 Sep 2024 11:30:25 -0400 Subject: [PATCH 3/4] Finishing o1 crawler example --- examples/o1_web_crawler /o1_web_crawler.py | 61 +++++++++++++--------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/examples/o1_web_crawler /o1_web_crawler.py b/examples/o1_web_crawler /o1_web_crawler.py index a0f98034..e4fee3e8 100644 --- a/examples/o1_web_crawler /o1_web_crawler.py +++ b/examples/o1_web_crawler /o1_web_crawler.py @@ -4,6 +4,16 @@ import json from dotenv import load_dotenv from openai import OpenAI +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + # Load environment variables load_dotenv() @@ -18,14 +28,14 @@ client = OpenAI(api_key=openai_api_key) # Find the page that most likely contains the objective def find_relevant_page_via_map(objective, url, app, client): try: - print(f"Understood. The objective is: {objective}") - print(f"Initiating search on the website: {url}") + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") map_prompt = f""" The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. """ - print("Analyzing objective to determine optimal search parameter...") + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") completion = client.chat.completions.create( model="o1-preview", messages=[ @@ -42,15 +52,15 @@ def find_relevant_page_via_map(objective, url, app, client): ) map_search_parameter = completion.choices[0].message.content - print(f"Optimal search parameter identified: {map_search_parameter}") + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") - print(f"Mapping website using the identified search parameter...") + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") map_website = app.map_url(url, params={"search": map_search_parameter}) - print("Website mapping completed successfully.") - print(f"Located {len(map_website)} relevant links.") + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + print(f"{Colors.GREEN}Located {len(map_website)} relevant links.{Colors.RESET}") return map_website except Exception as e: - print(f"Error encountered during relevant page identification: {str(e)}") + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") return None # Scrape the top 3 pages and see if the objective is met, if so return in json format else return None @@ -58,13 +68,13 @@ def find_objective_in_top_pages(map_website, objective, app, client): try: # Get top 3 links from the map result top_links = map_website[:3] if isinstance(map_website, list) else [] - print(f"Proceeding to analyze top {len(top_links)} links: {top_links}") + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") for link in top_links: - print(f"Initiating scrape of page: {link}") + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") # Scrape the page scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) - print("Page scraping completed successfully.") + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") # Check if objective is met @@ -82,7 +92,7 @@ def find_objective_in_top_pages(map_website, objective, app, client): 3. Do not include any explanations or markdown formatting in your response. """ - print("Analyzing scraped content to determine objective fulfillment...") + print(f"{Colors.YELLOW}Analyzing scraped content to determine objective fulfillment...{Colors.RESET}") completion = client.chat.completions.create( model="o1-preview", messages=[ @@ -101,43 +111,42 @@ def find_objective_in_top_pages(map_website, objective, app, client): result = completion.choices[0].message.content if result != "Objective not met": - print("Objective potentially fulfilled. Relevant information identified.") + print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}") try: - print(result) return json.loads(result) except json.JSONDecodeError: - print("Error in parsing response. Proceeding to next page...") + print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}") else: - print("Objective not met on this page. Proceeding to next link...") + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") - print("All available pages analyzed. Objective not fulfilled in examined content.") + print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") return None except Exception as e: - print(f"Error encountered during page analysis: {str(e)}") + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") return None # Main function to execute the process def main(): # Get user input - url = input("Enter the website to crawl: ") - objective = input("Enter your objective: ") + url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") - print("Initiating web crawling process.") + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") # Find the relevant page map_website = find_relevant_page_via_map(objective, url, app, client) if map_website: - print("Relevant pages identified. Proceeding with detailed analysis...") + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}") # Find objective in top pages result = find_objective_in_top_pages(map_website, objective, app, client) if result: - print("Objective successfully fulfilled. Extracted information:") - print(json.dumps(result, indent=2)) + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") else: - print("Unable to fulfill the objective with the available content.") + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") else: - print("No relevant pages identified. Consider refining the search parameters or trying a different website.") + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") if __name__ == "__main__": main() From e58144798f124629543ac1de1a3cae798228cf4d Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 16 Sep 2024 16:04:32 -0400 Subject: [PATCH 4/4] Update o1_web_crawler.py --- examples/o1_web_crawler /o1_web_crawler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/o1_web_crawler /o1_web_crawler.py b/examples/o1_web_crawler /o1_web_crawler.py index e4fee3e8..45bbd1ee 100644 --- a/examples/o1_web_crawler /o1_web_crawler.py +++ b/examples/o1_web_crawler /o1_web_crawler.py @@ -79,20 +79,19 @@ def find_objective_in_top_pages(map_website, objective, app, client): # Check if objective is met check_prompt = f""" - Given the following scraped content and objective, determine if the objective is met with high confidence. + Given the following scraped content and objective, determine if the objective is met. If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. - If the objective is not met with high confidence, respond with 'Objective not met'. + If the objective is not met with confidence, respond with 'Objective not met'. Objective: {objective} Scraped content: {scrape_result['markdown']} Remember: - 1. Only return JSON if you are highly confident the objective is fully met. + 1. Only return JSON if you are confident the objective is fully met. 2. Keep the JSON structure as simple and flat as possible. 3. Do not include any explanations or markdown formatting in your response. """ - - print(f"{Colors.YELLOW}Analyzing scraped content to determine objective fulfillment...{Colors.RESET}") + completion = client.chat.completions.create( model="o1-preview", messages=[ @@ -121,6 +120,7 @@ def find_objective_in_top_pages(map_website, objective, app, client): print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}") return None + except Exception as e: print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") return None