From d90a5db89e41ceb6b6bb739bd6c3608b87259dfc Mon Sep 17 00:00:00 2001
From: Federico Kamelhar <federico.kamelhar@oracle.com>
Date: Sat, 23 May 2026 13:38:27 -0400
Subject: [PATCH 1/3] test(integration): refresh stale notebook + workbench
 tests against current layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three groups of pre-existing integration failures, none related to live
LLM behavior — two are tests that fell out of sync with code that
moved, and one is an example notebook that insisted on non-standard
environment variables.

(A) ``tests/integration/test_notebooks_subset.py`` —
``TestNotebookExecution`` had 5 methods pointing at notebook filenames
that no longer exist on disk (the notebook catalogue was renumbered to
a contiguous 1-70 sequence). Updated each subprocess invocation to
target the current file. Renamed the test methods to match the new
numbers (``test_notebook_36_runs`` → ``test_notebook_35_runs`` etc.) so
the suite reads consistently end-to-end. Added a header comment noting
that these must stay in sync with ``examples/``.

(B) ``tests/integration/test_workbench_categories.py`` —
``test_endpoint_returns_curated_categories`` asserted ``"router"`` was
a top-level category; the workbench combined router + observability
into a single ``"router-observability"`` track in
``workbench/backend/runner.py::NOTEBOOK_CATEGORIES``. Updated the
required-id list. Renamed the SSE-suite assertion to
``test_router_observability_groups_router_plus_eventbus`` and pointed
it at notebooks 58-61 (the actual router + EventBus + observability
notebooks today) instead of 52-55 (which are now production /
checkpointer tests).

(C) ``examples/notebook_70_oci_tools.py`` — the ``_env`` helper hard-
required ``OCI_USE_PROFILE`` / ``OCI_USE_REGION`` / ``OCI_USE_TENANCY``
and ``OCI_GENAI_PROFILE``, exiting 2 if any was missing. That's
hostile to users who already exported the standard OCI envelope
(``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT``), and it's
exactly what tripped ``test_notebooks_all_live.py`` for this notebook
in CI. Added a ``fallbacks=`` parameter to ``_env`` and wired every
``OCI_USE_*`` / ``OCI_GENAI_*`` read to fall back through the standard
names. Documented in the helper's docstring.

Local re-runs:
- 5 ``test_notebooks_subset.py::TestNotebookExecution`` tests pass
  (33s, all run real ``python examples/notebook_NN_*.py`` subprocesses).
- 4 ``test_workbench_categories.py::TestNotebookCategories`` tests
  pass against the live runner ``TestClient``.
- ``test_notebooks_all_live.py[notebook_70_oci_tools]`` passes with
  ``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT`` set (no
  ``OCI_USE_*`` overrides required).

Signed-off-by: Federico Kamelhar <federico.kamelhar@oracle.com>
---
 examples/notebook_70_oci_tools.py             | 37 ++++++++++-----
 tests/integration/test_notebooks_subset.py    | 46 +++++++++++--------
 .../integration/test_workbench_categories.py  | 30 ++++++++----
 3 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/examples/notebook_70_oci_tools.py b/examples/notebook_70_oci_tools.py
index 76c31635..a59c4f86 100644
--- a/examples/notebook_70_oci_tools.py
+++ b/examples/notebook_70_oci_tools.py
@@ -74,11 +74,26 @@
 from typing import Any
 
 
-def _env(name: str, default: str | None = None) -> str:
-    val = os.environ.get(name, default)
+def _env(name: str, default: str | None = None, *, fallbacks: tuple[str, ...] = ()) -> str:
+    """Read env var ``name``; fall back to any of ``fallbacks`` if unset.
+
+    Supports the ``OCI_USE_*`` aliases documented in this notebook AND
+    the standard ``OCI_PROFILE`` / ``OCI_REGION`` / ``OCI_COMPARTMENT``
+    envelope, so users with stock OCI environment variables don't have
+    to re-export anything just to run this notebook.
+    """
+    val = os.environ.get(name)
     if not val:
+        for fb in fallbacks:
+            val = os.environ.get(fb)
+            if val:
+                break
+    if not val:
+        val = default
+    if not val:
+        tried = [name, *fallbacks]
         sys.stderr.write(
-            f"missing env var {name} — see the prerequisites in the notebook docstring\n"
+            f"missing env var (tried {tried}) — see the prerequisites in the notebook docstring\n"
         )
         sys.exit(2)
     return val
@@ -139,9 +154,9 @@ async def part2_execute() -> None:
     """Call real OCI services directly through use_oci."""
     from locus.tools import use_oci
 
-    profile = _env("OCI_USE_PROFILE")
-    region = _env("OCI_USE_REGION")
-    tenancy = _env("OCI_USE_TENANCY")
+    profile = _env("OCI_USE_PROFILE", fallbacks=("OCI_PROFILE",))
+    region = _env("OCI_USE_REGION", fallbacks=("OCI_REGION", "OCI_GENAI_REGION"))
+    tenancy = _env("OCI_USE_TENANCY", fallbacks=("OCI_COMPARTMENT", "OCI_TENANCY"))
 
     print(f"=== use_oci — direct dispatch (profile={profile}, region={region}) ===\n")
 
@@ -234,11 +249,11 @@ async def part3_agent() -> None:
     from locus.models import get_model
     from locus.tools import describe_oci, use_oci
 
-    use_profile = _env("OCI_USE_PROFILE")
-    use_region = _env("OCI_USE_REGION")
-    tenancy = _env("OCI_USE_TENANCY")
-    genai_profile = _env("OCI_GENAI_PROFILE")
-    genai_region = _env("OCI_GENAI_REGION", "us-chicago-1")
+    use_profile = _env("OCI_USE_PROFILE", fallbacks=("OCI_PROFILE",))
+    use_region = _env("OCI_USE_REGION", fallbacks=("OCI_REGION", "OCI_GENAI_REGION"))
+    tenancy = _env("OCI_USE_TENANCY", fallbacks=("OCI_COMPARTMENT", "OCI_TENANCY"))
+    genai_profile = _env("OCI_GENAI_PROFILE", fallbacks=("OCI_PROFILE",))
+    genai_region = _env("OCI_GENAI_REGION", "us-chicago-1", fallbacks=("OCI_REGION",))
 
     print(
         f"=== Agent loop (model via {genai_profile}@{genai_region}, "
diff --git a/tests/integration/test_notebooks_subset.py b/tests/integration/test_notebooks_subset.py
index 8d8dc0f1..9bfd4ae0 100644
--- a/tests/integration/test_notebooks_subset.py
+++ b/tests/integration/test_notebooks_subset.py
@@ -786,77 +786,83 @@ def test_sse_response_headers(self):
 class TestNotebookExecution:
     """Tests that run actual notebooks (with mock model)."""
 
+    # Test method names mirror the current ``examples/notebook_NN_*.py``
+    # numbering. The notebooks have been renumbered a few times during
+    # development; the previous test methods pointed at filenames that
+    # no longer exist on disk and were failing with FileNotFoundError.
+    # Keep these in sync with the actual ``examples/`` layout.
+
     @pytest.mark.asyncio
-    async def test_notebook_36_runs(self):
-        """Test that notebook 35 runs without error."""
+    async def test_notebook_13_runs(self):
+        """Smoke: notebook 13 (SSE streaming) executes cleanly."""
         import subprocess
         import sys
 
         result = subprocess.run(
-            [sys.executable, "examples/notebook_41_structured_output.py"],
+            [sys.executable, "examples/notebook_13_sse_streaming.py"],
             capture_output=True,
             text=True,
             timeout=60,
             check=False,
         )
-        assert result.returncode == 0, f"Notebook 36 failed: {result.stderr}"
+        assert result.returncode == 0, f"Notebook 13 failed: {result.stderr}"
 
     @pytest.mark.asyncio
-    async def test_notebook_37_runs(self):
-        """Test that notebook 36 runs without error."""
+    async def test_notebook_35_runs(self):
+        """Smoke: notebook 35 (structured output) executes cleanly."""
         import subprocess
         import sys
 
         result = subprocess.run(
-            [sys.executable, "examples/notebook_42_reasoning_patterns.py"],
+            [sys.executable, "examples/notebook_35_structured_output.py"],
             capture_output=True,
             text=True,
             timeout=60,
             check=False,
         )
-        assert result.returncode == 0, f"Notebook 37 failed: {result.stderr}"
+        assert result.returncode == 0, f"Notebook 35 failed: {result.stderr}"
 
     @pytest.mark.asyncio
-    async def test_notebook_43_runs(self):
-        """Test that notebook 42 runs without error."""
+    async def test_notebook_36_runs(self):
+        """Smoke: notebook 36 (reasoning patterns) executes cleanly."""
         import subprocess
         import sys
 
         result = subprocess.run(
-            [sys.executable, "examples/notebook_48_playbooks.py"],
+            [sys.executable, "examples/notebook_36_reasoning_patterns.py"],
             capture_output=True,
             text=True,
             timeout=60,
             check=False,
         )
-        assert result.returncode == 0, f"Notebook 43 failed: {result.stderr}"
+        assert result.returncode == 0, f"Notebook 36 failed: {result.stderr}"
 
     @pytest.mark.asyncio
-    async def test_notebook_49_runs(self):
-        """Test that notebook 48 runs without error."""
+    async def test_notebook_46_runs(self):
+        """Smoke: notebook 46 (playbooks) executes cleanly."""
         import subprocess
         import sys
 
         result = subprocess.run(
-            [sys.executable, "examples/notebook_54_checkpoint_backends.py"],
+            [sys.executable, "examples/notebook_46_playbooks.py"],
             capture_output=True,
             text=True,
             timeout=60,
             check=False,
         )
-        assert result.returncode == 0, f"Notebook 49 failed: {result.stderr}"
+        assert result.returncode == 0, f"Notebook 46 failed: {result.stderr}"
 
     @pytest.mark.asyncio
-    async def test_notebook_14_runs(self):
-        """Test that notebook 13 runs without error."""
+    async def test_notebook_52_runs(self):
+        """Smoke: notebook 52 (checkpoint backends) executes cleanly."""
         import subprocess
         import sys
 
         result = subprocess.run(
-            [sys.executable, "examples/notebook_19_sse_streaming.py"],
+            [sys.executable, "examples/notebook_52_checkpoint_backends.py"],
             capture_output=True,
             text=True,
             timeout=60,
             check=False,
         )
-        assert result.returncode == 0, f"Notebook 14 failed: {result.stderr}"
+        assert result.returncode == 0, f"Notebook 52 failed: {result.stderr}"
diff --git a/tests/integration/test_workbench_categories.py b/tests/integration/test_workbench_categories.py
index ac2cb301..9220aab5 100644
--- a/tests/integration/test_workbench_categories.py
+++ b/tests/integration/test_workbench_categories.py
@@ -64,7 +64,17 @@ def test_endpoint_returns_curated_categories(self, client: TestClient) -> None:
         # Must include the cardinal sections — these power the user-
         # facing learning path. Drift here = the README / nav docs are
         # describing categories that no longer exist.
-        for required in ("fundamentals", "graphs", "multi-agent", "router", "observability"):
+        #
+        # ``router-observability`` is a single combined category — the
+        # cognitive router and the EventBus observability surface ship
+        # together as one learning track, and the workbench reflects that
+        # in its NOTEBOOK_CATEGORIES list.
+        for required in (
+            "fundamentals",
+            "graphs",
+            "multi-agent",
+            "router-observability",
+        ):
             assert required in ids, f"missing notebook category: {required}"
         for c in cats:
             assert c["name"], f"category {c['id']} has empty name"
@@ -78,16 +88,20 @@ def test_every_notebook_has_known_category(self, client: TestClient) -> None:
                 f"notebook {t['id']} has unknown category {t.get('category')!r}"
             )
 
-    def test_observability_category_contains_new_sse_notebooks(self, client: TestClient) -> None:
-        """Notebooks 52-55 (the SSE retrofit suite) must live under
-        ``observability`` so the sidebar surfaces them as a group."""
-        obs_numbers = sorted(
+    def test_router_observability_groups_router_plus_eventbus(self, client: TestClient) -> None:
+        """The combined ``router-observability`` track must surface the
+        cognitive router (notebook 58) and the EventBus / observability
+        notebooks (59, 60, 61) as a single sidebar group. Drift here
+        means the curated learning path lost a notebook to ``misc``."""
+        track_numbers = sorted(
             t["number"]
             for t in client.get("/api/notebooks").json()
-            if t.get("category") == "observability"
+            if t.get("category") == "router-observability"
         )
-        for n in (52, 53, 54, 55):
-            assert n in obs_numbers, f"notebook {n} missing from 'observability'"
+        for n in (58, 59, 60, 61):
+            assert n in track_numbers, (
+                f"notebook {n} missing from 'router-observability' (got {track_numbers})"
+            )
 
     def test_notebooks_sorted_by_category_then_order(self, client: TestClient) -> None:
         """The catalogue is pre-sorted by (category position,

From 816fcef7ee12bd3e3fd922a1f44621eb7d7ac676 Mon Sep 17 00:00:00 2001
From: Federico Kamelhar <federico.kamelhar@oracle.com>
Date: Sat, 23 May 2026 15:49:15 -0400
Subject: [PATCH 2/3] fix(oci+tests): bump OCI client read timeout +
 integration fixture max_tokens for reasoning models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three more root causes behind pre-existing integration failures (five
tests in total) that surfaced once the stale-test cleanup landed. All
stem from the test envelope being too tight for reasoning-model traffic
(gpt-5.5, o-series), not from real bugs.

(1) ``src/locus/models/providers/oci/client.py`` — the OCI Python SDK
default read timeout is 60s, which isn't enough for reasoning-model
summarization calls in the orchestrator + swarm flows (first response
token can take 90-180s to arrive after the model finishes hidden
chain-of-thought). Added ``connect_timeout`` (default 10s) and
``read_timeout`` (default 300s) to ``OCIClientConfig`` and wired both
through to ``GenerativeAiInferenceClient`` for all four auth modes
(api_key, security_token, instance_principal, resource_principal).
Before this change the short read timeout surfaced in failures as
``urllib3.ReadTimeoutError: ... read timeout=60.0``.

(2) ``tests/integration/test_notebooks_all_live.py`` — the
``_NOTEBOOK_TIMEOUT_OVERRIDES`` map keyed off
``notebook_40_emergent_routing.py``; the notebook had been renumbered
to ``notebook_34_emergent_routing.py`` so the override no longer
matched, leaving the test on the default 360s budget while the
underlying notebook actually needs ~7-9 min. Renamed the key.

(3) ``tests/integration/conftest.py`` — the OCI / OpenAI test
fixtures built models with ``max_tokens=512``. Reasoning models burn
200-2000+ output tokens on hidden chain-of-thought before producing
any visible text; at 512 they return empty content with
``finish_reason='length'``, which surfaces in orchestrator + swarm
tests as ``summary=''`` and ``findings={}`` even though
``success=True``. Bumped to 8192 with a comment explaining the
ceiling-vs-target tradeoff (short-answer tests still finish fast
because the model stops naturally when done).

Local re-runs (BOAT-OC1, ``openai.gpt-5.5``, us-chicago-1):

- ``test_summary_instead_of_bare_stop`` — passes (was OCI timeout)
- ``test_notebook_runs_clean[notebook_34_emergent_routing]`` — passes
  (was 360s subprocess timeout)
- ``test_swarm_executes_tasks`` — passes (was empty findings)
- ``test_orchestrator_single_specialist`` — passes (was empty summary)
- ``test_orchestrator_multiple_specialists`` — passes (was empty summary)

5/5 of the previously-environmental failures now pass deterministically.

Signed-off-by: Federico Kamelhar <federico.kamelhar@oracle.com>
---
 src/locus/models/providers/oci/client.py     | 23 ++++++++++++++++++++
 tests/integration/conftest.py                | 13 +++++++++--
 tests/integration/test_notebooks_all_live.py | 10 ++++++---
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/src/locus/models/providers/oci/client.py b/src/locus/models/providers/oci/client.py
index e8545a1f..5bb2612c 100644
--- a/src/locus/models/providers/oci/client.py
+++ b/src/locus/models/providers/oci/client.py
@@ -68,6 +68,25 @@ class OCIClientConfig(BaseModel):
     auth_type: OCIAuthType = Field(default=OCIAuthType.API_KEY, description="Auth type")
     compartment_id: str | None = Field(default=None, description="OCI compartment OCID")
     service_endpoint: str | None = Field(default=None, description="Full service endpoint URL")
+    # HTTP timeouts in seconds for the underlying OCI Python SDK
+    # ``GenerativeAiInferenceClient``. The SDK defaults to ``(10, 60)``
+    # (connect, read); 60s read is not enough for reasoning models
+    # (gpt-5.5, o-series, etc.) doing long-form summarization in
+    # orchestrator/swarm flows, where the first response token can take
+    # 90-180 seconds to arrive. Bump the read timeout to 300s by
+    # default; callers needing tighter latency contracts can override.
+    connect_timeout: float = Field(
+        default=10.0,
+        description="HTTP connect timeout in seconds.",
+    )
+    read_timeout: float = Field(
+        default=300.0,
+        description=(
+            "HTTP read timeout in seconds. Default 300s accommodates "
+            "reasoning-model summarization (gpt-5.5, o-series) which "
+            "can sit on the wire for 90-180s before the first token."
+        ),
+    )
 
     model_config = {"extra": "allow"}
 
@@ -161,6 +180,7 @@ def _create_client(self) -> GenerativeAiInferenceClient:
         return GenerativeAiInferenceClient(
             config=self.oci_config,
             service_endpoint=self.config.service_endpoint,
+            timeout=(self.config.connect_timeout, self.config.read_timeout),
         )
 
     def _create_security_token_client(self) -> GenerativeAiInferenceClient:
@@ -207,6 +227,7 @@ def _create_security_token_client(self) -> GenerativeAiInferenceClient:
             config=oci_cfg,
             signer=signer,
             service_endpoint=self.config.service_endpoint,
+            timeout=(self.config.connect_timeout, self.config.read_timeout),
         )
 
     def _create_instance_principal_client(self) -> GenerativeAiInferenceClient:
@@ -223,6 +244,7 @@ def _create_instance_principal_client(self) -> GenerativeAiInferenceClient:
             config={},
             signer=signer,
             service_endpoint=self.config.service_endpoint,
+            timeout=(self.config.connect_timeout, self.config.read_timeout),
         )
 
     def _create_resource_principal_client(self) -> GenerativeAiInferenceClient:
@@ -239,6 +261,7 @@ def _create_resource_principal_client(self) -> GenerativeAiInferenceClient:
             config={},
             signer=signer,
             service_endpoint=self.config.service_endpoint,
+            timeout=(self.config.connect_timeout, self.config.read_timeout),
         )
 
     def get_serving_mode(self, model_id: str) -> Any:
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index ec4e9030..799ce2ba 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -213,13 +213,22 @@ def _build_model():
         if endpoint and compartment:
             from locus.models.providers.oci import OCIModel
 
+            # max_tokens=8192 is a ceiling, not a target. Reasoning
+            # models (gpt-5.5, o-series, etc.) burn 200-2000+ tokens
+            # of hidden chain-of-thought before producing any visible
+            # output; at 512 they typically return empty content with
+            # finish_reason='length', which surfaces in orchestrator /
+            # swarm tests as ``summary=''``. 8192 leaves room for both
+            # the reasoning trace and a normal-length response without
+            # being wasteful on short-answer tests (the model stops
+            # naturally when done).
             return OCIModel(
                 model_id=model_id,
                 profile_name=os.getenv("OCI_PROFILE", "DEFAULT"),
                 auth_type=os.getenv("OCI_AUTH_TYPE", "api_key"),
                 service_endpoint=endpoint,
                 compartment_id=compartment,
-                max_tokens=512,
+                max_tokens=8192,
             )
 
     # OpenAI fallback
@@ -227,7 +236,7 @@ def _build_model():
         from locus.models.native.openai import OpenAIModel
 
         model_id = os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini")
-        return OpenAIModel(model=model_id, max_tokens=512)
+        return OpenAIModel(model=model_id, max_tokens=8192)
 
     # Anthropic fallback — cheapest path for non-OCI iteration.
     if anthropic_available():
diff --git a/tests/integration/test_notebooks_all_live.py b/tests/integration/test_notebooks_all_live.py
index ce5b35cd..bc4d2542 100644
--- a/tests/integration/test_notebooks_all_live.py
+++ b/tests/integration/test_notebooks_all_live.py
@@ -56,9 +56,13 @@ def _has_oci_config() -> bool:
 # the default and override per-notebook below.
 _DEFAULT_TIMEOUT = 360
 _NOTEBOOK_TIMEOUT_OVERRIDES: dict[str, int] = {
-    # notebook_40_emergent_routing: 5 dispatches × 2-3 LLM calls each
-    # through a reasoning model — empirical wall time ~7-9 min.
-    "notebook_40_emergent_routing.py": 900,
+    # notebook_34_emergent_routing: 5 dispatches × 2-3 LLM calls each
+    # through a reasoning model — empirical wall time ~7-9 min. The
+    # filename used to be ``notebook_40_emergent_routing.py``; this
+    # override key was stale after the catalogue renumbering and let
+    # the test fall through to the ``_DEFAULT_TIMEOUT`` (360s), which
+    # isn't enough — the subprocess was getting SIGKILL'd at 6 min.
+    "notebook_34_emergent_routing.py": 900,
 }
 
 

From 68b501bc4c54add01f2b3f406138774912d73a18 Mon Sep 17 00:00:00 2001
From: Federico Kamelhar <federico.kamelhar@oracle.com>
Date: Sat, 23 May 2026 16:08:00 -0400
Subject: [PATCH 3/3] test(oci-client): update mock assertion for new timeout
 kwarg

``test_instance_principal_client_creation`` was pinned to the exact
keyword args passed to ``GenerativeAiInferenceClient``. The previous
commit added ``timeout=(connect, read)`` to all four client-creation
paths, so the strict ``assert_called_once_with(...)`` started failing
because the expected call lacked the new kwarg. Updated the assertion
to include the default tuple ``(10.0, 300.0)`` from
``OCIClientConfig``.

Signed-off-by: Federico Kamelhar <federico.kamelhar@oracle.com>
---
 tests/unit/test_oci_client.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_oci_client.py b/tests/unit/test_oci_client.py
index b72ff21d..2aba7d47 100644
--- a/tests/unit/test_oci_client.py
+++ b/tests/unit/test_oci_client.py
@@ -293,6 +293,7 @@ def test_instance_principal_client_creation(self, mock_signer_class, mock_client
             config={},
             signer=mock_signer,
             service_endpoint="https://test.endpoint.com",
+            timeout=(10.0, 300.0),
         )
         assert result == mock_client