Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,35 @@ jobs:

test:
name: Tests
runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.10", "3.11", "3.12"]
include:
# Smoke-test on Windows to catch POSIX-only regressions (e.g. SIGALRM).
- os: windows-latest
python-version: "3.11"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 3.11 here?

# Smoke-test on macOS — primary developer platform, catches BSD/Darwin quirks.
- os: macos-latest
python-version: "3.11"
steps:
- uses: actions/checkout@v4

- name: Install uv
- name: Install uv and Python ${{ matrix.python-version }}
uses: astral-sh/setup-uv@v7
with:
version: "latest"

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: uv sync --extra dev
# Force uv-managed (python-build-standalone) Python. The runner's
# pre-installed Python on macOS is compiled without
# --enable-loadable-sqlite-extensions, so sqlite-vec fails to load.
# python-build-standalone enables loadable sqlite extensions on
# every platform.
run: uv sync --extra dev --python-preference=only-managed

- name: Run tests
run: uv run pytest -v --cov --cov-config=pyproject.toml --cov-report=xml
Expand All @@ -65,7 +75,7 @@ jobs:

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
if: matrix.python-version == '3.11'
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11'
with:
files: ./coverage.xml
fail_ci_if_error: false
Expand Down
9 changes: 8 additions & 1 deletion librarian/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,8 +662,13 @@ def add_source(
rprint(f" [red]Error:[/red] {err.get('path', '')}: {err.get('error', '')}")

skipped = result.get("skipped", 0)
status_line = (
"[red]Source added, but indexing had errors.[/red]"
if errors
else "[green]Source added and indexed![/green]"
)
summary = (
f"[green]Source added and indexed![/green]\n\n"
f"{status_line}\n\n"
f"Name: [cyan]{source['name']}[/cyan]\n"
f"Path: [blue]{source_path}[/blue]\n"
f"Files found: [yellow]{len(files_to_index)}[/yellow]\n"
Expand All @@ -675,6 +680,8 @@ def add_source(
summary += f"\nErrors: [red]{len(errors)}[/red]"

rprint(Panel(summary, title="Source Added"))
if errors:
raise typer.Exit(1)


# =============================================================================
Expand Down
63 changes: 34 additions & 29 deletions librarian/processing/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import logging
import signal
from abc import ABC, abstractmethod
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

from librarian.types import ParsedDocument
Expand All @@ -31,6 +33,30 @@ def _timeout_handler(signum: int, frame: object) -> None:
raise FileReadTimeoutError("File read timed out")


@contextmanager
def _read_timeout(timeout: int) -> Iterator[None]:
"""Apply a POSIX alarm-based read timeout when available; no-op otherwise.

Windows Python does not expose ``signal.SIGALRM`` / ``signal.alarm``, so we
fall back to an unbounded read on those platforms rather than refusing to
parse the file at all.
"""
sigalrm = getattr(signal, "SIGALRM", None)
alarm = getattr(signal, "alarm", None)
if sigalrm is None or alarm is None:
yield
return

old_handler = signal.getsignal(sigalrm)
signal.signal(sigalrm, _timeout_handler)
alarm(timeout)
try:
yield
finally:
alarm(0)
signal.signal(sigalrm, old_handler)


def safe_read_text(
file_path: Path,
timeout: int = DEFAULT_READ_TIMEOUT,
Expand Down Expand Up @@ -61,21 +87,14 @@ def safe_read_text(
msg = f"File not found: {file_path}"
raise FileNotFoundError(msg)

old_handler = signal.getsignal(signal.SIGALRM)
content: str | None = None
try:
signal.signal(signal.SIGALRM, _timeout_handler)
signal.alarm(timeout)

try:
content = file_path.read_text(encoding=encoding)
except UnicodeDecodeError:
if fallback_encoding:
content = file_path.read_text(encoding=fallback_encoding)
else:
with _read_timeout(timeout):
try:
return file_path.read_text(encoding=encoding)
except UnicodeDecodeError:
if fallback_encoding:
return file_path.read_text(encoding=fallback_encoding)
raise

signal.alarm(0)
except FileReadTimeoutError as e:
raise FileReadTimeoutError(
f"Timed out reading {file_path} after {timeout}s "
Expand All @@ -87,11 +106,6 @@ def safe_read_text(
raise FileReadError(f"Permission denied: {file_path}") from e
except OSError as e:
raise FileReadError(f"Cannot read {file_path}: {e}") from e
finally:
signal.alarm(0)
signal.signal(signal.SIGALRM, old_handler)

return content


def safe_read_bytes(
Expand All @@ -117,13 +131,9 @@ def safe_read_bytes(
msg = f"File not found: {file_path}"
raise FileNotFoundError(msg)

old_handler = signal.getsignal(signal.SIGALRM)
content: bytes | None = None
try:
signal.signal(signal.SIGALRM, _timeout_handler)
signal.alarm(timeout)
content = file_path.read_bytes()
signal.alarm(0)
with _read_timeout(timeout):
return file_path.read_bytes()
except FileReadTimeoutError as e:
raise FileReadTimeoutError(
f"Timed out reading {file_path} after {timeout}s "
Expand All @@ -135,11 +145,6 @@ def safe_read_bytes(
raise FileReadError(f"Permission denied: {file_path}") from e
except OSError as e:
raise FileReadError(f"Cannot read {file_path}: {e}") from e
finally:
signal.alarm(0)
signal.signal(signal.SIGALRM, old_handler)

return content


class BaseParser(ABC):
Expand Down
46 changes: 46 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Tests for CLI behavior."""

from pathlib import Path
from typing import Any

import pytest
from typer.testing import CliRunner

from librarian import cli


def test_add_directory_exits_nonzero_when_indexing_errors(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
"""Directory add should fail automation when indexing reports errors."""
docs_dir = tmp_path / "docs"
docs_dir.mkdir()
fixture = docs_dir / "fixture.md"
fixture.write_text("# Synthetic Markdown Fixture\n\nContent.", encoding="utf-8")

config_dir = tmp_path / "config"
monkeypatch.setattr(cli, "CONFIG_DIR", config_dir)
monkeypatch.setattr(cli, "SOURCES_FILE", config_dir / "sources.json")
monkeypatch.setattr(cli, "SETTINGS_FILE", config_dir / "settings.json")
monkeypatch.setattr(cli, "_get_config", lambda: {"ensure_directories": lambda: None})

async def fake_server_ingest(context: Any, directory: str) -> dict[str, Any]:
return {
"directory": directory,
"total_files": 1,
"indexed": 0,
"updated": 0,
"skipped": 0,
"errors": [{"path": str(fixture), "error": "parser exploded"}],
"files": [],
}

monkeypatch.setattr("librarian.server.index_directory_to_library", fake_server_ingest)

# Force a wide terminal so Rich does not wrap the error message off-screen.
monkeypatch.setenv("COLUMNS", "500")
result = CliRunner().invoke(cli.app, ["add", str(docs_dir), "--verbose"])

assert result.exit_code == 1
assert "Errors:" in result.output
assert "parser exploded" in result.output
28 changes: 28 additions & 0 deletions tests/test_file_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ def test_read_existing_file(self, tmp_path: Path) -> None:
content = safe_read_text(test_file)
assert content == "hello world"

def test_read_existing_file_without_sigalrm(
self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
"""Test reading text on platforms without POSIX SIGALRM."""
import signal

test_file = tmp_path / "test.md"
test_file.write_text("# Hello\nworld", encoding="utf-8")

monkeypatch.delattr(signal, "SIGALRM", raising=False)

content = safe_read_text(test_file)
assert content == "# Hello\nworld"

def test_file_not_found(self, tmp_path: Path) -> None:
"""Test FileNotFoundError for missing files."""
missing = tmp_path / "nonexistent.md"
Expand Down Expand Up @@ -99,6 +113,20 @@ def test_read_existing_file(self, tmp_path: Path) -> None:
content = safe_read_bytes(test_file)
assert content == b"\x00\x01\x02"

def test_read_existing_file_without_sigalrm(
self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
"""Test reading bytes on platforms without POSIX SIGALRM."""
import signal

test_file = tmp_path / "test.bin"
test_file.write_bytes(b"\x00\x01\x02")

monkeypatch.delattr(signal, "SIGALRM", raising=False)

content = safe_read_bytes(test_file)
assert content == b"\x00\x01\x02"

def test_file_not_found(self, tmp_path: Path) -> None:
"""Test FileNotFoundError for missing files."""
missing = tmp_path / "nonexistent.bin"
Expand Down
Loading