2024-09-13 15:08:23 -04:00
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI
2024-09-16 11:30:25 -04:00
# ANSI color codes
class Colors :
CYAN = ' \033 [96m '
YELLOW = ' \033 [93m '
GREEN = ' \033 [92m '
RED = ' \033 [91m '
MAGENTA = ' \033 [95m '
BLUE = ' \033 [94m '
RESET = ' \033 [0m '
2024-09-13 15:08:23 -04:00
# Load environment variables
load_dotenv ( )
# Retrieve API keys from environment variables
firecrawl_api_key = os . getenv ( " FIRECRAWL_API_KEY " )
openai_api_key = os . getenv ( " OPENAI_API_KEY " )
# Initialize the FirecrawlApp and OpenAI client
app = FirecrawlApp ( api_key = firecrawl_api_key )
client = OpenAI ( api_key = openai_api_key )
# Find the page that most likely contains the objective
def find_relevant_page_via_map ( objective , url , app , client ) :
try :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . CYAN } Understood. The objective is: { objective } { Colors . RESET } " )
print ( f " { Colors . CYAN } Initiating search on the website: { url } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
map_prompt = f """
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: { objective } , come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
2024-09-16 11:30:25 -04:00
print ( f " { Colors . YELLOW } Analyzing objective to determine optimal search parameter... { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
completion = client . chat . completions . create (
model = " o1-preview " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " text " ,
" text " : map_prompt
}
]
}
]
)
map_search_parameter = completion . choices [ 0 ] . message . content
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Optimal search parameter identified: { map_search_parameter } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
2024-09-16 11:30:25 -04:00
print ( f " { Colors . YELLOW } Mapping website using the identified search parameter... { Colors . RESET } " )
2024-09-16 11:18:57 -04:00
map_website = app . map_url ( url , params = { " search " : map_search_parameter } )
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Website mapping completed successfully. { Colors . RESET } " )
print ( f " { Colors . GREEN } Located { len ( map_website ) } relevant links. { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
return map_website
except Exception as e :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } Error encountered during relevant page identification: { str ( e ) } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
return None
# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages ( map_website , objective , app , client ) :
try :
# Get top 3 links from the map result
2024-09-16 11:18:57 -04:00
top_links = map_website [ : 3 ] if isinstance ( map_website , list ) else [ ]
2024-09-16 11:30:25 -04:00
print ( f " { Colors . CYAN } Proceeding to analyze top { len ( top_links ) } links: { top_links } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
for link in top_links :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . YELLOW } Initiating scrape of page: { link } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
# Scrape the page
scrape_result = app . scrape_url ( link , params = { ' formats ' : [ ' markdown ' ] } )
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Page scraping completed successfully. { Colors . RESET } " )
2024-09-16 11:18:57 -04:00
2024-09-13 15:08:23 -04:00
# Check if objective is met
check_prompt = f """
2024-09-16 16:04:32 -04:00
Given the following scraped content and objective, determine if the objective is met.
2024-09-16 11:18:57 -04:00
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
2024-09-16 16:04:32 -04:00
If the objective is not met with confidence, respond with ' Objective not met ' .
2024-09-13 15:08:23 -04:00
Objective: { objective }
2024-09-16 11:18:57 -04:00
Scraped content: { scrape_result [ ' markdown ' ] }
Remember:
2024-09-16 16:04:32 -04:00
1. Only return JSON if you are confident the objective is fully met.
2024-09-16 11:18:57 -04:00
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
2024-09-13 15:08:23 -04:00
"""
2024-09-16 16:04:32 -04:00
2024-09-13 15:08:23 -04:00
completion = client . chat . completions . create (
model = " o1-preview " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " text " ,
" text " : check_prompt
}
]
}
]
)
result = completion . choices [ 0 ] . message . content
if result != " Objective not met " :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Objective potentially fulfilled. Relevant information identified. { Colors . RESET } " )
2024-09-16 11:18:57 -04:00
try :
return json . loads ( result )
except json . JSONDecodeError :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } Error in parsing response. Proceeding to next page... { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
else :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . YELLOW } Objective not met on this page. Proceeding to next link... { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } All available pages analyzed. Objective not fulfilled in examined content. { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
return None
2024-09-16 16:04:32 -04:00
2024-09-13 15:08:23 -04:00
except Exception as e :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } Error encountered during page analysis: { str ( e ) } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
return None
# Main function to execute the process
def main ( ) :
# Get user input
2024-09-16 11:30:25 -04:00
url = input ( f " { Colors . BLUE } Enter the website to crawl: { Colors . RESET } " )
objective = input ( f " { Colors . BLUE } Enter your objective: { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
2024-09-16 11:30:25 -04:00
print ( f " { Colors . YELLOW } Initiating web crawling process... { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
# Find the relevant page
map_website = find_relevant_page_via_map ( objective , url , app , client )
if map_website :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Relevant pages identified. Proceeding with detailed analysis... { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
# Find objective in top pages
result = find_objective_in_top_pages ( map_website , objective , app , client )
if result :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . GREEN } Objective successfully fulfilled. Extracted information: { Colors . RESET } " )
print ( f " { Colors . MAGENTA } { json . dumps ( result , indent = 2 ) } { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
else :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } Unable to fulfill the objective with the available content. { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
else :
2024-09-16 11:30:25 -04:00
print ( f " { Colors . RED } No relevant pages identified. Consider refining the search parameters or trying a different website. { Colors . RESET } " )
2024-09-13 15:08:23 -04:00
if __name__ == " __main__ " :
main ( )