Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions haystack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
from haystack.version import __version__ # noqa: F401

# Initialize the logging configuration
# This is a no-op unless `structlog` is installed
haystack.logging.configure_logging()
# This is a no-op unless `structlog` is installed, and `force=False` makes it skip configuration if `structlog` has
# already been configured by the host application (so importing Haystack does not overwrite their setup).
haystack.logging.configure_logging(force=False)

# Same for tracing (no op if `opentelemetry` or `ddtrace` is not installed)
haystack.tracing.auto_enable_tracing()
Expand Down
92 changes: 70 additions & 22 deletions haystack/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import sys
import typing
from collections.abc import Sequence
from typing import Any

if typing.TYPE_CHECKING:
Expand All @@ -16,6 +17,10 @@
HAYSTACK_LOGGING_USE_JSON_ENV_VAR = "HAYSTACK_LOGGING_USE_JSON"
HAYSTACK_LOGGING_IGNORE_STRUCTLOG_ENV_VAR = "HAYSTACK_LOGGING_IGNORE_STRUCTLOG"

# Attribute set on a logger once we have patched its methods. `logging.getLogger` returns a shared singleton, so we
# use this marker to patch each logger only once and avoid wrapping the already-wrapped methods on repeated calls.
_PATCHED_MARKER = "_haystack_patched"


class PatchedLogger(typing.Protocol):
"""Class which enables using type checkers to find wrong logger usage."""
Expand Down Expand Up @@ -220,16 +225,13 @@ def _patch_structlog_call_information(logger: logging.Logger) -> None:
if not isinstance(logger, _FixedFindCallerLogger):
return

# completely copied from structlog. We only add `haystack.logging` to the list of ignored frames
# Copied from structlog's `_FixedFindCallerLogger.findCaller`, only adding `haystack.logging` to the list of
# ignored frames so our own logging wrappers don't show up as the caller. We deliberately do not forward
# `stacklevel` to `_find_first_app_frame_and_name`: that parameter only exists in structlog >= 25.5.0 and
# structlog is an optional dependency, so forwarding it would break logging on older versions.
def findCaller(stack_info: bool = False, stacklevel: int = 1) -> tuple[str, int, str, str | None]: # noqa: ARG001
try:
sinfo: str | None
# we need to exclude `haystack.logging` from the stack
f, name = _find_first_app_frame_and_name(["logging", "haystack.logging"])
sinfo = _format_stack(f) if stack_info else None
except Exception as error:
print(f"Error in findCaller: {error}")

f, _name = _find_first_app_frame_and_name(["logging", "haystack.logging"])
sinfo = _format_stack(f) if stack_info else None
return f.f_code.co_filename, f.f_lineno, f.f_code.co_name, sinfo

logger.findCaller = findCaller # type: ignore
Expand All @@ -247,6 +249,11 @@ def getLogger(name: str) -> PatchedLogger:
- it makes structure logging effective, not just an available feature
"""
logger = logging.getLogger(name)
if getattr(logger, _PATCHED_MARKER, False):
# Already patched: `logging.getLogger` returned the same singleton, so re-patching would stack the wrappers
# and interpolate the message more than once.
return typing.cast(PatchedLogger, logger)

logger.debug = patch_log_method_to_kwargs_only(logger.debug) # type: ignore
logger.info = patch_log_method_to_kwargs_only(logger.info) # type: ignore
logger.warn = patch_log_method_to_kwargs_only(logger.warn) # type: ignore
Expand All @@ -262,6 +269,8 @@ def getLogger(name: str) -> PatchedLogger:
# We also patch the `makeRecord` method to use keyword string interpolation
logger.makeRecord = patch_make_records_to_use_kwarg_string_interpolation(logger.makeRecord) # type: ignore

setattr(logger, _PATCHED_MARKER, True)

return typing.cast(PatchedLogger, logger)


Expand Down Expand Up @@ -295,7 +304,12 @@ def correlate_logs_with_traces(_: "WrappedLogger", __: str, event_dict: "EventDi
return event_dict


def configure_logging(use_json: bool | None = None) -> None:
def configure_logging(
use_json: bool | None = None,
logger_name: str | Sequence[str] = ("haystack", "haystack_integrations", "haystack_experimental"),
propagate: bool = True,
force: bool = True,
) -> None:
"""
Configure logging for Haystack.

Expand All @@ -306,6 +320,25 @@ def configure_logging(use_json: bool | None = None) -> None:
- If `structlog` is installed, you can JSON format all logs. Enable this by
- setting the `use_json` parameter to `True` when calling this function
- setting the environment variable `HAYSTACK_LOGGING_USE_JSON` to `true`

:param use_json: Whether to format logs as JSON. If `None`, we try to guess based on the environment.
:param logger_name:
The name (or names) of the logger our formatting handler is attached to. Defaults to Haystack's own
namespaces (`"haystack"`, `"haystack_integrations"` and `"haystack_experimental"`), so that we only touch
Haystack's own loggers and leave the logging configuration of the host application and any other libraries
running in the same process untouched. Pass an empty string (`""`) to attach the handler to the root logger
instead - this restores the legacy behavior of formatting *every* log record in the process.
:param propagate:
Whether the configured loggers should propagate their records to ancestor loggers (ultimately the root
logger). The default (`True`) keeps records flowing to handlers configured by the host application and to
capturing tools such as `pytest`'s `caplog`. Set it to `False` to make Haystack fully own the output of its
own logs - this avoids duplicate log lines when the host application also configures the root logger. It has
no effect when `logger_name=""` (the root logger has no ancestors).
:param force:
Whether to (re)configure logging even if `structlog` has already been configured by someone else. The default
(`True`) means an explicit call always takes over. Pass `False` to make this a no-op when `structlog` is
already configured - this is used by the import-time call in `haystack/__init__.py` so that merely importing
Haystack does not overwrite a `structlog` configuration set up by the host application.
"""
import haystack.utils.jupyter # to avoid circular imports

Expand All @@ -322,6 +355,11 @@ def configure_logging(use_json: bool | None = None) -> None:
# If the user wants to ignore structlog, we don't configure it and fall back to standard logging
return

# When not forcing, skip configuration if structlog is already configured (e.g. by the host application) so we
# leave its configuration and handlers untouched.
if not force and structlog.is_configured():
return

# We roughly follow the structlog documentation here:
# https://www.structlog.org/en/stable/standard-library.html#rendering-using-structlog-based-formatters-within-logging
# This means that we use structlog to format the log entries for entries emitted via `logging` and `structlog`.
Expand Down Expand Up @@ -352,11 +390,16 @@ def configure_logging(use_json: bool | None = None) -> None:
shared_processors.append(correlate_logs_with_traces)

structlog.configure(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This structlog.configure(...) call is (still?) process-global.

If Haystack is imported before the host app configures structlog, any non-Haystack structlog logger used while Haystack’s config is active will still run through Haystack’s processor chain (regardless of logger name).

Maybe we could move structlog.configure(...) into configure_logging(...) calls.

processors=shared_processors + [structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
# `filter_by_level` reads the effective level from the underlying stdlib logger on *every* call, so changes
# to the log level made after `configure_logging` runs (e.g. by the host application) are respected.
processors=[
structlog.stdlib.filter_by_level,
*shared_processors,
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
],
logger_factory=structlog.stdlib.LoggerFactory(ignore_frame_names=["haystack.logging"]),
cache_logger_on_first_use=True,
# This is a filter that will filter out log entries that are below the log level of the root logger.
wrapper_class=structlog.make_filtering_bound_logger(min_level=logging.root.getEffectiveLevel()),
wrapper_class=structlog.stdlib.BoundLogger,
)

renderers: list[Processor]
Expand Down Expand Up @@ -392,12 +435,17 @@ def configure_logging(use_json: bool | None = None) -> None:
# Use OUR `ProcessorFormatter` to format all `logging` entries.
handler.setFormatter(formatter)

root_logger = logging.getLogger()
# avoid adding our handler twice
old_handlers = [
h
for h in root_logger.handlers
if not (isinstance(h, logging.StreamHandler) and h.name == "HaystackLoggingHandler")
]
new_handlers = [handler, *old_handlers]
root_logger.handlers = new_handlers
# By default we attach to Haystack's own namespaces so that we only format Haystack's log records and leave the
# loggers of the host application and other libraries in the same process untouched. Pass `logger_name=""` to
# attach to the root logger instead (legacy behavior - formats every record in the process).
logger_names = [logger_name] if isinstance(logger_name, str) else list(logger_name)
for name in logger_names:
target_logger = logging.getLogger(name)
# avoid adding our handler twice
old_handlers = [
h
for h in target_logger.handlers
if not (isinstance(h, logging.StreamHandler) and h.name == "HaystackLoggingHandler")
]
target_logger.handlers = [handler, *old_handlers]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may duplicate Haystack logs when users opt back into root formatting with configure_logging(logger_name="").

import haystack already installs HaystackLoggingHandler on the haystack namespaces then adds another handler to root but leaves the namespace handlers in place. Since propagation remains enabled, haystack.* records may be emitted twice.

Not 100% sure that this happens, would try to verify with e.g.:

import logging
import haystack

haystack.logging.configure_logging(use_json=True, logger_name="")
logging.getLogger("haystack.demo").warning("dup check")

And maybe extend this test to cover also the logger_name="" case.

target_logger.propagate = propagate
5 changes: 4 additions & 1 deletion haystack/utils/requests_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
import httpx
from tenacity import after_log, before_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential

logger = logging.getLogger(__file__)
# NOTE: this uses the standard library logger (not `haystack.logging`) on purpose: tenacity's `before_log`/`after_log`
# call the logger with positional arguments, which Haystack's keyword-only patched logger would reject. We still name
# it with `__name__` so it lives under the `haystack` namespace and is picked up by `configure_logging`.
logger = logging.getLogger(__name__)


def request_with_retry(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
upgrade:
- |
``haystack.logging.configure_logging`` now attaches its formatting handler to Haystack's own logger namespaces
(``haystack``, ``haystack_integrations`` and ``haystack_experimental``) instead of the root logger. This makes
Haystack a better-behaved library when it runs next to other services in the same process: the log records of the
host application and other libraries are no longer reformatted or duplicated by Haystack. To restore the previous
behavior of formatting every log record in the process, call ``configure_logging(logger_name="")``.
fixes:
- |
``haystack.logging.configure_logging`` no longer freezes the log level at import time. Previously the level was
captured once when the function ran (at ``import haystack``), so a log level set later by the application was
ignored for ``structlog``-native loggers. The effective level is now read on every log call.
- |
Fixed the logger in ``haystack.utils.requests_utils`` being named after the module's file path instead of
``haystack.utils.requests_utils``, which kept its records outside the ``haystack`` logger namespace.
- |
Importing Haystack no longer overwrites a ``structlog`` configuration that the host application already set up.
The import-time call to ``configure_logging`` now skips configuration when ``structlog`` is already configured.
- |
``haystack.logging.getLogger`` is now idempotent. Previously, calling it more than once for the same logger name
wrapped the already-wrapped logger methods again, which caused the log message to be run through ``str.format``
once per call. As a result a field value containing ``{...}`` could be re-interpolated and pull in the value of
another field. Each logger is now patched only once.
- |
The patched ``findCaller`` used to determine the source of a log entry no longer prints to stdout and no longer
masks errors with a misleading ``NameError``, matching structlog's own ``findCaller`` implementation.
features:
- |
``haystack.logging.configure_logging`` gained three parameters: ``logger_name`` to choose which logger(s) the
formatting handler is attached to, ``propagate`` to control whether Haystack's loggers propagate their records to
ancestor loggers, and ``force`` to control whether an existing ``structlog`` configuration is replaced. Set
``propagate=False`` to let Haystack fully own the output of its own logs and avoid duplicate log lines when the
host application also configures the root logger.
Loading
Loading