Merge branch 'main' of https://github.com/mendableai/firecrawl
This commit is contained in:
@@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
const links: string[] = [];
|
const links: string[] = [];
|
||||||
|
|
||||||
// Parse the base URL to get the origin
|
|
||||||
const urlObject = new URL(baseUrl);
|
|
||||||
const origin = urlObject.origin;
|
|
||||||
|
|
||||||
$('a').each((_, element) => {
|
$('a').each((_, element) => {
|
||||||
const href = $(element).attr('href');
|
const href = $(element).attr('href');
|
||||||
if (href) {
|
if (href) {
|
||||||
if (href.startsWith('http://') || href.startsWith('https://')) {
|
try {
|
||||||
// Absolute URL, add as is
|
if (href.startsWith('http://') || href.startsWith('https://')) {
|
||||||
links.push(href);
|
// Absolute URL, add as is
|
||||||
} else if (href.startsWith('/')) {
|
links.push(href);
|
||||||
// Relative URL starting with '/', append to origin
|
} else if (href.startsWith('/')) {
|
||||||
links.push(new URL(href, baseUrl).href);
|
// Relative URL starting with '/', append to base URL
|
||||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
links.push(new URL(href, baseUrl).href);
|
||||||
// Relative URL not starting with '/', append to base URL
|
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
||||||
links.push(new URL(href, baseUrl).href);
|
// Relative URL not starting with '/', append to base URL
|
||||||
} else if (href.startsWith('mailto:')) {
|
links.push(new URL(href, baseUrl).href);
|
||||||
// mailto: links, add as is
|
} else if (href.startsWith('mailto:')) {
|
||||||
links.push(href);
|
// mailto: links, add as is
|
||||||
|
links.push(href);
|
||||||
|
}
|
||||||
|
// Fragment-only links (#) are ignored
|
||||||
|
} catch (error) {
|
||||||
|
// Log the error and continue
|
||||||
|
console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
|
||||||
}
|
}
|
||||||
// Fragment-only links (#) are ignored
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from .firecrawl import FirecrawlApp
|
from .firecrawl import FirecrawlApp # noqa
|
||||||
|
|
||||||
__version__ = "1.4.0"
|
__version__ = "1.4.0"
|
||||||
|
|
||||||
@@ -19,24 +19,46 @@ __version__ = "1.4.0"
|
|||||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||||
|
|
||||||
|
|
||||||
def _basic_config() -> None:
|
def _configure_logger() -> None:
|
||||||
"""Set up basic configuration for logging with a specific format and date format."""
|
"""
|
||||||
|
Configure the firecrawl logger for console output.
|
||||||
|
|
||||||
|
The function attaches a handler for console output with a specific format and date
|
||||||
|
format to the firecrawl logger.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
logging.basicConfig(
|
# Create the formatter
|
||||||
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
formatter = logging.Formatter(
|
||||||
|
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Create the console handler and set the formatter
|
||||||
|
console_handler = logging.StreamHandler()
|
||||||
|
console_handler.setFormatter(formatter)
|
||||||
|
|
||||||
|
# Add the console handler to the firecrawl logger
|
||||||
|
logger.addHandler(console_handler)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to configure logging: %s", e)
|
logger.error("Failed to configure logging: %s", e)
|
||||||
|
|
||||||
|
|
||||||
def setup_logging() -> None:
|
def setup_logging() -> None:
|
||||||
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
||||||
env = os.environ.get(
|
# Check if the firecrawl logger already has a handler
|
||||||
"FIRECRAWL_LOGGING_LEVEL", "INFO"
|
if logger.hasHandlers():
|
||||||
).upper() # Default to 'INFO' level
|
return # To prevent duplicate logging
|
||||||
_basic_config()
|
|
||||||
|
|
||||||
|
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
|
||||||
|
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
|
||||||
|
# Attach a no-op handler to prevent warnings about no handlers
|
||||||
|
logger.addHandler(logging.NullHandler())
|
||||||
|
return
|
||||||
|
|
||||||
|
# Attach the console handler to the firecrawl logger
|
||||||
|
_configure_logger()
|
||||||
|
|
||||||
|
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
|
||||||
if env == "DEBUG":
|
if env == "DEBUG":
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
elif env == "INFO":
|
elif env == "INFO":
|
||||||
|
|||||||
Reference in New Issue
Block a user