From c21ffa1f9ca874a654b491d11d5de63f5b4b360f Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 9 Sep 2024 16:04:27 +0200 Subject: [PATCH] Correctly handle log level configuration --- src/crawlee/_log_config.py | 68 +++++++++++++++++---- src/crawlee/basic_crawler/_basic_crawler.py | 16 ++--- src/crawlee/configuration.py | 6 +- 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/src/crawlee/_log_config.py b/src/crawlee/_log_config.py index 329fe147e..cd3340701 100644 --- a/src/crawlee/_log_config.py +++ b/src/crawlee/_log_config.py @@ -2,11 +2,15 @@ import json import logging +import sys import textwrap -import traceback -from typing import Any +from typing import TYPE_CHECKING, Any from colorama import Fore, Style, just_fix_windows_console +from typing_extensions import assert_never + +if TYPE_CHECKING: + from crawlee.configuration import Configuration just_fix_windows_console() @@ -31,6 +35,44 @@ _LOG_MESSAGE_INDENT = ' ' * 6 +def get_configured_log_level(configuration: Configuration) -> int: + verbose_logging_requested = 'verbose_log' in configuration.model_fields_set and configuration.verbose_log + + if 'log_level' in configuration.model_fields_set: + if configuration.log_level == 'DEBUG': + return logging.DEBUG + if configuration.log_level == 'INFO': + return logging.INFO + if configuration.log_level == 'WARNING': + return logging.WARNING + if configuration.log_level == 'ERROR': + return logging.ERROR + + assert_never(configuration.log_level) + + if sys.flags.dev_mode or verbose_logging_requested: + return logging.DEBUG + + return logging.INFO + + +def configure_logger( + logger: logging.Logger, + configuration: Configuration, + *, + remove_old_handlers: bool = False, +) -> None: + handler = logging.StreamHandler() + handler.setFormatter(CrawleeLogFormatter()) + + if remove_old_handlers: + for old_handler in logger.handlers[:]: + logger.removeHandler(old_handler) + + logger.addHandler(handler) + logger.setLevel(get_configured_log_level(configuration)) + + class CrawleeLogFormatter(logging.Formatter): """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields. @@ -87,15 +129,6 @@ def format(self, record: logging.LogRecord) -> str: level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname) level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} ' - # Format the exception, if there is some - # Basically just print the traceback and indent it a bit - exception_string = '' - if record.exc_info: - exc_info = record.exc_info - record.exc_info = None - exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip() - exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT) - # Format the extra log record fields, if there were some # Just stringify them to JSON and color them gray extra_string = '' @@ -105,8 +138,19 @@ def format(self, record: logging.LogRecord) -> str: f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}' ) + # Call the parent method so that it populates missing fields in the record + super().format(record) + # Format the actual log message - log_string = super().format(record) + log_string = self.formatMessage(record) + + # Format the exception, if there is some + # Basically just print the traceback and indent it a bit + exception_string = '' + if record.exc_text: + exception_string = '\n' + textwrap.indent(record.exc_text.rstrip(), _LOG_MESSAGE_INDENT) + else: + exception_string = '' if self.include_logger_name: # Include logger name at the beginning of the log line diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py index 0785a0b47..377e147ec 100644 --- a/src/crawlee/basic_crawler/_basic_crawler.py +++ b/src/crawlee/basic_crawler/_basic_crawler.py @@ -22,7 +22,7 @@ from crawlee._autoscaling import AutoscaledPool from crawlee._autoscaling.snapshotter import Snapshotter from crawlee._autoscaling.system_status import SystemStatus -from crawlee._log_config import CrawleeLogFormatter +from crawlee._log_config import configure_logger, get_configured_log_level from crawlee._request import BaseRequestData, Request, RequestState from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction from crawlee._utils.byte_size import ByteSize @@ -203,20 +203,14 @@ def __init__( self._retry_on_blocked = retry_on_blocked if configure_logging: - handler = logging.StreamHandler() - handler.setFormatter(CrawleeLogFormatter()) - root_logger = logging.getLogger() - - for old_handler in root_logger.handlers[:]: - root_logger.removeHandler(old_handler) - - root_logger.addHandler(handler) - root_logger.setLevel(logging.INFO if not sys.flags.dev_mode else logging.DEBUG) + configure_logger(root_logger, self._configuration, remove_old_handlers=True) # Silence HTTPX logger httpx_logger = logging.getLogger('httpx') - httpx_logger.setLevel(logging.WARNING if not sys.flags.dev_mode else logging.INFO) + httpx_logger.setLevel( + logging.DEBUG if get_configured_log_level(self._configuration) <= logging.DEBUG else logging.WARNING + ) if not _logger: _logger = logging.getLogger(__name__) diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index 6e935f53d..ff117e1a7 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -3,7 +3,7 @@ from __future__ import annotations from datetime import timedelta -from typing import Annotated +from typing import Annotated, Literal from pydantic import AliasChoices, Field from pydantic_settings import BaseSettings, SettingsConfigDict @@ -51,14 +51,14 @@ class Configuration(BaseSettings): ] = False log_level: Annotated[ - int, + Literal['DEBUG', 'INFO', 'WARNING', 'ERROR'], Field( validation_alias=AliasChoices( 'apify_log_level', 'crawlee_log_level', ) ), - ] = 4 # INFO + ] = 'INFO' default_dataset_id: Annotated[ str,