This commit is contained in:
Nicolas
2024-11-05 13:08:02 -05:00
2 changed files with 49 additions and 26 deletions
@@ -29,18 +29,15 @@ export function extractLinks(html: string, baseUrl: string): string[] {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const links: string[] = []; const links: string[] = [];
// Parse the base URL to get the origin
const urlObject = new URL(baseUrl);
const origin = urlObject.origin;
$('a').each((_, element) => { $('a').each((_, element) => {
const href = $(element).attr('href'); const href = $(element).attr('href');
if (href) { if (href) {
try {
if (href.startsWith('http://') || href.startsWith('https://')) { if (href.startsWith('http://') || href.startsWith('https://')) {
// Absolute URL, add as is // Absolute URL, add as is
links.push(href); links.push(href);
} else if (href.startsWith('/')) { } else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin // Relative URL starting with '/', append to base URL
links.push(new URL(href, baseUrl).href); links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) { } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL // Relative URL not starting with '/', append to base URL
@@ -50,6 +47,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
links.push(href); links.push(href);
} }
// Fragment-only links (#) are ignored // Fragment-only links (#) are ignored
} catch (error) {
// Log the error and continue
console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
}
} }
}); });
+31 -9
View File
@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
import logging import logging
import os import os
from .firecrawl import FirecrawlApp from .firecrawl import FirecrawlApp # noqa
__version__ = "1.4.0" __version__ = "1.4.0"
@@ -19,24 +19,46 @@ __version__ = "1.4.0"
logger: logging.Logger = logging.getLogger("firecrawl") logger: logging.Logger = logging.getLogger("firecrawl")
def _basic_config() -> None: def _configure_logger() -> None:
"""Set up basic configuration for logging with a specific format and date format.""" """
Configure the firecrawl logger for console output.
The function attaches a handler for console output with a specific format and date
format to the firecrawl logger.
"""
try: try:
logging.basicConfig( # Create the formatter
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", formatter = logging.Formatter(
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S", datefmt="%Y-%m-%d %H:%M:%S",
) )
# Create the console handler and set the formatter
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
# Add the console handler to the firecrawl logger
logger.addHandler(console_handler)
except Exception as e: except Exception as e:
logger.error("Failed to configure logging: %s", e) logger.error("Failed to configure logging: %s", e)
def setup_logging() -> None: def setup_logging() -> None:
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.""" """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
env = os.environ.get( # Check if the firecrawl logger already has a handler
"FIRECRAWL_LOGGING_LEVEL", "INFO" if logger.hasHandlers():
).upper() # Default to 'INFO' level return # To prevent duplicate logging
_basic_config()
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
# Attach a no-op handler to prevent warnings about no handlers
logger.addHandler(logging.NullHandler())
return
# Attach the console handler to the firecrawl logger
_configure_logger()
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
if env == "DEBUG": if env == "DEBUG":
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
elif env == "INFO": elif env == "INFO":