From d90a5db89e41ceb6b6bb739bd6c3608b87259dfc Mon Sep 17 00:00:00 2001 From: Federico Kamelhar Date: Sat, 23 May 2026 13:38:27 -0400 Subject: [PATCH 1/3] test(integration): refresh stale notebook + workbench tests against current layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three groups of pre-existing integration failures, none related to live LLM behavior — all are tests that drifted past code that moved. (A) ``tests/integration/test_notebooks_subset.py`` — ``TestNotebookExecution`` had 5 methods pointing at notebook filenames that no longer exist on disk (the notebook catalogue was renumbered to a contiguous 1-70 sequence). Updated each subprocess invocation to target the current file. Renamed the test methods to match the new numbers (``test_notebook_36_runs`` → ``test_notebook_35_runs`` etc.) so the suite reads consistently end-to-end. Added a header comment noting that these must stay in sync with ``examples/``. (B) ``tests/integration/test_workbench_categories.py`` — ``test_endpoint_returns_curated_categories`` asserted ``"router"`` was a top-level category; the workbench combined router + observability into a single ``"router-observability"`` track in ``workbench/backend/runner.py::NOTEBOOK_CATEGORIES``. Updated the required-id list. Renamed the SSE-suite assertion to ``test_router_observability_groups_router_plus_eventbus`` and pointed it at notebooks 58-61 (the actual router + EventBus + observability notebooks today) instead of 52-55 (which are now production / checkpointer tests). (E) ``examples/notebook_70_oci_tools.py`` — the ``_env`` helper hard-required ``OCI_USE_PROFILE`` / ``OCI_USE_REGION`` / ``OCI_USE_TENANCY`` and ``OCI_GENAI_PROFILE``, exiting with status 2 if any was missing. That's hostile to users who already exported the standard OCI envelope (``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT``), and it's exactly what tripped ``test_notebooks_all_live.py`` for this notebook in CI. 
Added a ``fallbacks=`` parameter to ``_env`` and wired every ``OCI_USE_*`` / ``OCI_GENAI_*`` read to fall back through the standard names. Documented in the helper's docstring. Local re-runs: - 5 ``test_notebooks_subset.py::TestNotebookExecution`` tests pass (33s, all run real ``python examples/notebook_NN_*.py`` subprocesses). - 4 ``test_workbench_categories.py::TestNotebookCategories`` tests pass against the live runner ``TestClient``. - ``test_notebooks_all_live.py[notebook_70_oci_tools]`` passes with ``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT`` set (no ``OCI_USE_*`` overrides required). Signed-off-by: Federico Kamelhar --- examples/notebook_70_oci_tools.py | 37 ++++++++++----- tests/integration/test_notebooks_subset.py | 46 +++++++++++-------- .../integration/test_workbench_categories.py | 30 ++++++++---- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/examples/notebook_70_oci_tools.py b/examples/notebook_70_oci_tools.py index 76c31635..a59c4f86 100644 --- a/examples/notebook_70_oci_tools.py +++ b/examples/notebook_70_oci_tools.py @@ -74,11 +74,26 @@ from typing import Any -def _env(name: str, default: str | None = None) -> str: - val = os.environ.get(name, default) +def _env(name: str, default: str | None = None, *, fallbacks: tuple[str, ...] = ()) -> str: + """Read env var ``name``; fall back to any of ``fallbacks`` if unset. + + Supports the ``OCI_USE_*`` aliases documented in this notebook AND + the standard ``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT`` + envelope, so users with stock OCI environment variables don't have + to re-export anything just to run this notebook. 
+ """ + val = os.environ.get(name) if not val: + for fb in fallbacks: + val = os.environ.get(fb) + if val: + break + if not val: + val = default + if not val: + tried = [name, *fallbacks] sys.stderr.write( - f"missing env var {name} — see the prerequisites in the notebook docstring\n" + f"missing env var (tried {tried}) — see the prerequisites in the notebook docstring\n" ) sys.exit(2) return val @@ -139,9 +154,9 @@ async def part2_execute() -> None: """Call real OCI services directly through use_oci.""" from locus.tools import use_oci - profile = _env("OCI_USE_PROFILE") - region = _env("OCI_USE_REGION") - tenancy = _env("OCI_USE_TENANCY") + profile = _env("OCI_USE_PROFILE", fallbacks=("OCI_PROFILE",)) + region = _env("OCI_USE_REGION", fallbacks=("OCI_REGION", "OCI_GENAI_REGION")) + tenancy = _env("OCI_USE_TENANCY", fallbacks=("OCI_COMPARTMENT", "OCI_TENANCY")) print(f"=== use_oci — direct dispatch (profile={profile}, region={region}) ===\n") @@ -234,11 +249,11 @@ async def part3_agent() -> None: from locus.models import get_model from locus.tools import describe_oci, use_oci - use_profile = _env("OCI_USE_PROFILE") - use_region = _env("OCI_USE_REGION") - tenancy = _env("OCI_USE_TENANCY") - genai_profile = _env("OCI_GENAI_PROFILE") - genai_region = _env("OCI_GENAI_REGION", "us-chicago-1") + use_profile = _env("OCI_USE_PROFILE", fallbacks=("OCI_PROFILE",)) + use_region = _env("OCI_USE_REGION", fallbacks=("OCI_REGION", "OCI_GENAI_REGION")) + tenancy = _env("OCI_USE_TENANCY", fallbacks=("OCI_COMPARTMENT", "OCI_TENANCY")) + genai_profile = _env("OCI_GENAI_PROFILE", fallbacks=("OCI_PROFILE",)) + genai_region = _env("OCI_GENAI_REGION", "us-chicago-1", fallbacks=("OCI_REGION",)) print( f"=== Agent loop (model via {genai_profile}@{genai_region}, " diff --git a/tests/integration/test_notebooks_subset.py b/tests/integration/test_notebooks_subset.py index 8d8dc0f1..9bfd4ae0 100644 --- a/tests/integration/test_notebooks_subset.py +++ b/tests/integration/test_notebooks_subset.py 
@@ -786,77 +786,83 @@ def test_sse_response_headers(self): class TestNotebookExecution: """Tests that run actual notebooks (with mock model).""" + # Test method names mirror the current ``examples/notebook_NN_*.py`` + # numbering. The notebooks have been renumbered a few times during + # development; the previous test methods pointed at filenames that + # no longer exist on disk and were failing with FileNotFoundError. + # Keep these in sync with the actual ``examples/`` layout. + @pytest.mark.asyncio - async def test_notebook_36_runs(self): - """Test that notebook 35 runs without error.""" + async def test_notebook_13_runs(self): + """Smoke: notebook 13 (SSE streaming) executes cleanly.""" import subprocess import sys result = subprocess.run( - [sys.executable, "examples/notebook_41_structured_output.py"], + [sys.executable, "examples/notebook_13_sse_streaming.py"], capture_output=True, text=True, timeout=60, check=False, ) - assert result.returncode == 0, f"Notebook 36 failed: {result.stderr}" + assert result.returncode == 0, f"Notebook 13 failed: {result.stderr}" @pytest.mark.asyncio - async def test_notebook_37_runs(self): - """Test that notebook 36 runs without error.""" + async def test_notebook_35_runs(self): + """Smoke: notebook 35 (structured output) executes cleanly.""" import subprocess import sys result = subprocess.run( - [sys.executable, "examples/notebook_42_reasoning_patterns.py"], + [sys.executable, "examples/notebook_35_structured_output.py"], capture_output=True, text=True, timeout=60, check=False, ) - assert result.returncode == 0, f"Notebook 37 failed: {result.stderr}" + assert result.returncode == 0, f"Notebook 35 failed: {result.stderr}" @pytest.mark.asyncio - async def test_notebook_43_runs(self): - """Test that notebook 42 runs without error.""" + async def test_notebook_36_runs(self): + """Smoke: notebook 36 (reasoning patterns) executes cleanly.""" import subprocess import sys result = subprocess.run( - [sys.executable, 
"examples/notebook_48_playbooks.py"], + [sys.executable, "examples/notebook_36_reasoning_patterns.py"], capture_output=True, text=True, timeout=60, check=False, ) - assert result.returncode == 0, f"Notebook 43 failed: {result.stderr}" + assert result.returncode == 0, f"Notebook 36 failed: {result.stderr}" @pytest.mark.asyncio - async def test_notebook_49_runs(self): - """Test that notebook 48 runs without error.""" + async def test_notebook_46_runs(self): + """Smoke: notebook 46 (playbooks) executes cleanly.""" import subprocess import sys result = subprocess.run( - [sys.executable, "examples/notebook_54_checkpoint_backends.py"], + [sys.executable, "examples/notebook_46_playbooks.py"], capture_output=True, text=True, timeout=60, check=False, ) - assert result.returncode == 0, f"Notebook 49 failed: {result.stderr}" + assert result.returncode == 0, f"Notebook 46 failed: {result.stderr}" @pytest.mark.asyncio - async def test_notebook_14_runs(self): - """Test that notebook 13 runs without error.""" + async def test_notebook_52_runs(self): + """Smoke: notebook 52 (checkpoint backends) executes cleanly.""" import subprocess import sys result = subprocess.run( - [sys.executable, "examples/notebook_19_sse_streaming.py"], + [sys.executable, "examples/notebook_52_checkpoint_backends.py"], capture_output=True, text=True, timeout=60, check=False, ) - assert result.returncode == 0, f"Notebook 14 failed: {result.stderr}" + assert result.returncode == 0, f"Notebook 52 failed: {result.stderr}" diff --git a/tests/integration/test_workbench_categories.py b/tests/integration/test_workbench_categories.py index ac2cb301..9220aab5 100644 --- a/tests/integration/test_workbench_categories.py +++ b/tests/integration/test_workbench_categories.py @@ -64,7 +64,17 @@ def test_endpoint_returns_curated_categories(self, client: TestClient) -> None: # Must include the cardinal sections — these power the user- # facing learning path. 
Drift here = the README / nav docs are # describing categories that no longer exist. - for required in ("fundamentals", "graphs", "multi-agent", "router", "observability"): + # + # ``router-observability`` is a single combined category — the + # cognitive router and the EventBus observability surface ship + # together as one learning track, and the workbench reflects that + # in its NOTEBOOK_CATEGORIES list. + for required in ( + "fundamentals", + "graphs", + "multi-agent", + "router-observability", + ): assert required in ids, f"missing notebook category: {required}" for c in cats: assert c["name"], f"category {c['id']} has empty name" @@ -78,16 +88,20 @@ def test_every_notebook_has_known_category(self, client: TestClient) -> None: f"notebook {t['id']} has unknown category {t.get('category')!r}" ) - def test_observability_category_contains_new_sse_notebooks(self, client: TestClient) -> None: - """Notebooks 52-55 (the SSE retrofit suite) must live under - ``observability`` so the sidebar surfaces them as a group.""" - obs_numbers = sorted( + def test_router_observability_groups_router_plus_eventbus(self, client: TestClient) -> None: + """The combined ``router-observability`` track must surface the + cognitive router (notebook 58) and the EventBus / observability + notebooks (59, 60, 61) as a single sidebar group. 
Drift here + means the curated learning path lost a notebook to ``misc``.""" + track_numbers = sorted( t["number"] for t in client.get("/api/notebooks").json() - if t.get("category") == "observability" + if t.get("category") == "router-observability" ) - for n in (52, 53, 54, 55): - assert n in obs_numbers, f"notebook {n} missing from 'observability'" + for n in (58, 59, 60, 61): + assert n in track_numbers, ( + f"notebook {n} missing from 'router-observability' (got {track_numbers})" + ) def test_notebooks_sorted_by_category_then_order(self, client: TestClient) -> None: """The catalogue is pre-sorted by (category position, From 816fcef7ee12bd3e3fd922a1f44621eb7d7ac676 Mon Sep 17 00:00:00 2001 From: Federico Kamelhar Date: Sat, 23 May 2026 15:49:15 -0400 Subject: [PATCH 2/3] fix(oci+tests): bump OCI client read timeout + integration fixture max_tokens for reasoning models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three more pre-existing integration failures that surfaced once the stale-test cleanup landed. All caused by the test envelope being too tight for reasoning-model traffic (gpt-5.5, o-series), not by real bugs. (1) ``src/locus/models/providers/oci/client.py`` — the OCI Python SDK default read timeout is 60s, which isn't enough for reasoning-model summarization calls in the orchestrator + swarm flows (first response token can take 90-180s to arrive after the model finishes hidden chain-of-thought). Added ``connect_timeout`` (default 10s) and ``read_timeout`` (default 300s) to ``OCIClientConfig`` and wired both through to ``GenerativeAiInferenceClient`` for all four auth modes (api_key, security_token, instance_principal, resource_principal). Surfaces in failures as ``urllib3.ReadTimeoutError: ... read timeout=60.0``. 
(2) ``tests/integration/test_notebooks_all_live.py`` — the ``_NOTEBOOK_TIMEOUT_OVERRIDES`` map keyed off ``notebook_40_emergent_routing.py``; the notebook had been renumbered to ``notebook_34_emergent_routing.py`` so the override no longer matched, leaving the test on the default 360s budget while the underlying notebook actually needs ~7-9 min. Renamed the key. (3) ``tests/integration/conftest.py`` — the OCI / OpenAI test fixtures built models with ``max_tokens=512``. Reasoning models burn 200-2000+ output tokens on hidden chain-of-thought before producing any visible text; at 512 they return empty content with ``finish_reason='length'``, which surfaces in orchestrator + swarm tests as ``summary=''`` and ``findings={}`` even though ``success=True``. Bumped to 8192 with a comment explaining the ceiling-vs-target tradeoff (short-answer tests still finish fast because the model stops naturally when done). Local re-runs (BOAT-OC1, ``openai.gpt-5.5``, us-chicago-1): - ``test_summary_instead_of_bare_stop`` — passes (was OCI timeout) - ``test_notebook_runs_clean[notebook_34_emergent_routing]`` — passes (was 360s subprocess timeout) - ``test_swarm_executes_tasks`` — passes (was empty findings) - ``test_orchestrator_single_specialist`` — passes (was empty summary) - ``test_orchestrator_multiple_specialists`` — passes (was empty summary) 5/5 of the previously-environmental failures now pass deterministically. 
Signed-off-by: Federico Kamelhar --- src/locus/models/providers/oci/client.py | 23 ++++++++++++++++++++ tests/integration/conftest.py | 13 +++++++++-- tests/integration/test_notebooks_all_live.py | 10 ++++++--- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/locus/models/providers/oci/client.py b/src/locus/models/providers/oci/client.py index e8545a1f..5bb2612c 100644 --- a/src/locus/models/providers/oci/client.py +++ b/src/locus/models/providers/oci/client.py @@ -68,6 +68,25 @@ class OCIClientConfig(BaseModel): auth_type: OCIAuthType = Field(default=OCIAuthType.API_KEY, description="Auth type") compartment_id: str | None = Field(default=None, description="OCI compartment OCID") service_endpoint: str | None = Field(default=None, description="Full service endpoint URL") + # HTTP timeouts in seconds for the underlying OCI Python SDK + # ``GenerativeAiInferenceClient``. The SDK defaults to ``(10, 60)`` + # (connect, read); 60s read is not enough for reasoning models + # (gpt-5.5, o-series, etc.) doing long-form summarization in + # orchestrator/swarm flows, where the first response token can take + # 90-180 seconds to arrive. Bump the read timeout to 300s by + # default; callers needing tighter latency contracts can override. + connect_timeout: float = Field( + default=10.0, + description="HTTP connect timeout in seconds.", + ) + read_timeout: float = Field( + default=300.0, + description=( + "HTTP read timeout in seconds. Default 300s accommodates " + "reasoning-model summarization (gpt-5.5, o-series) which " + "can sit on the wire for 90-180s before the first token." 
+ ), + ) model_config = {"extra": "allow"} @@ -161,6 +180,7 @@ def _create_client(self) -> GenerativeAiInferenceClient: return GenerativeAiInferenceClient( config=self.oci_config, service_endpoint=self.config.service_endpoint, + timeout=(self.config.connect_timeout, self.config.read_timeout), ) def _create_security_token_client(self) -> GenerativeAiInferenceClient: @@ -207,6 +227,7 @@ def _create_security_token_client(self) -> GenerativeAiInferenceClient: config=oci_cfg, signer=signer, service_endpoint=self.config.service_endpoint, + timeout=(self.config.connect_timeout, self.config.read_timeout), ) def _create_instance_principal_client(self) -> GenerativeAiInferenceClient: @@ -223,6 +244,7 @@ def _create_instance_principal_client(self) -> GenerativeAiInferenceClient: config={}, signer=signer, service_endpoint=self.config.service_endpoint, + timeout=(self.config.connect_timeout, self.config.read_timeout), ) def _create_resource_principal_client(self) -> GenerativeAiInferenceClient: @@ -239,6 +261,7 @@ def _create_resource_principal_client(self) -> GenerativeAiInferenceClient: config={}, signer=signer, service_endpoint=self.config.service_endpoint, + timeout=(self.config.connect_timeout, self.config.read_timeout), ) def get_serving_mode(self, model_id: str) -> Any: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ec4e9030..799ce2ba 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -213,13 +213,22 @@ def _build_model(): if endpoint and compartment: from locus.models.providers.oci import OCIModel + # max_tokens=8192 is a ceiling, not a target. Reasoning + # models (gpt-5.5, o-series, etc.) burn 200-2000+ tokens + # of hidden chain-of-thought before producing any visible + # output; at 512 they typically return empty content with + # finish_reason='length', which surfaces in orchestrator / + # swarm tests as ``summary=''``. 
8192 leaves room for both + # the reasoning trace and a normal-length response without + # being wasteful on short-answer tests (the model stops + # naturally when done). return OCIModel( model_id=model_id, profile_name=os.getenv("OCI_PROFILE", "DEFAULT"), auth_type=os.getenv("OCI_AUTH_TYPE", "api_key"), service_endpoint=endpoint, compartment_id=compartment, - max_tokens=512, + max_tokens=8192, ) # OpenAI fallback @@ -227,7 +236,7 @@ def _build_model(): from locus.models.native.openai import OpenAIModel model_id = os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini") - return OpenAIModel(model=model_id, max_tokens=512) + return OpenAIModel(model=model_id, max_tokens=8192) # Anthropic fallback — cheapest path for non-OCI iteration. if anthropic_available(): diff --git a/tests/integration/test_notebooks_all_live.py b/tests/integration/test_notebooks_all_live.py index ce5b35cd..bc4d2542 100644 --- a/tests/integration/test_notebooks_all_live.py +++ b/tests/integration/test_notebooks_all_live.py @@ -56,9 +56,13 @@ def _has_oci_config() -> bool: # the default and override per-notebook below. _DEFAULT_TIMEOUT = 360 _NOTEBOOK_TIMEOUT_OVERRIDES: dict[str, int] = { - # notebook_40_emergent_routing: 5 dispatches × 2-3 LLM calls each - # through a reasoning model — empirical wall time ~7-9 min. - "notebook_40_emergent_routing.py": 900, + # notebook_34_emergent_routing: 5 dispatches × 2-3 LLM calls each + # through a reasoning model — empirical wall time ~7-9 min. The + # filename used to be ``notebook_40_emergent_routing.py``; this + # override key was stale after the catalogue renumbering and let + # the test fall through to the ``_DEFAULT_TIMEOUT`` (360s), which + # isn't enough — the subprocess was getting SIGKILL'd at 6 min. 
+ "notebook_34_emergent_routing.py": 900, } From 68b501bc4c54add01f2b3f406138774912d73a18 Mon Sep 17 00:00:00 2001 From: Federico Kamelhar Date: Sat, 23 May 2026 16:08:00 -0400 Subject: [PATCH 3/3] test(oci-client): update mock assertion for new timeout kwarg ``test_instance_principal_client_creation`` was pinned to the exact keyword args passed to ``GenerativeAiInferenceClient``. The previous commit added ``timeout=(connect, read)`` to all four client-creation paths, so the strict ``assert_called_once_with(...)`` started missing the new kwarg. Updated the assertion to include the default tuple ``(10.0, 300.0)`` from ``OCIClientConfig``. Signed-off-by: Federico Kamelhar --- tests/unit/test_oci_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_oci_client.py b/tests/unit/test_oci_client.py index b72ff21d..2aba7d47 100644 --- a/tests/unit/test_oci_client.py +++ b/tests/unit/test_oci_client.py @@ -293,6 +293,7 @@ def test_instance_principal_client_creation(self, mock_signer_class, mock_client config={}, signer=mock_signer, service_endpoint="https://test.endpoint.com", + timeout=(10.0, 300.0), ) assert result == mock_client