From cf4a0dd47aa6c38184d31d64776497df04166a79 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 11:22:00 -0400 Subject: [PATCH 01/35] docs: design for unified harness tracing/message-emitting surface Approach A (Agentex event stream as canonical source of truth): one tap per harness feeds shared yield/auto-send delivery adapters and a span-deriving tracing tap. Additive backwards-compat, stacked PRs <1000 lines, conformance + live-matrix testing (3 test agents per harness: sync/async/temporal). Co-Authored-By: Claude Opus 4.8 (1M context) --- ...26-06-18-unified-harness-surface-design.md | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md diff --git a/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md new file mode 100644 index 000000000..8e5411863 --- /dev/null +++ b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md @@ -0,0 +1,204 @@ +# Unified Harness Tracing / Message-Emitting Surface + +Date: 2026-06-18 +Status: Approved design, pending implementation +Repo: `scale-agentex-python` + +## Problem + +The SDK integrates several agent harnesses (pydantic-ai, LangGraph, OpenAI Agents) by +converting each harness's native output into Agentex `StreamTaskMessage*` events. Today +that integration is triplicated per harness: + +- `_<harness>_sync.py` — a converter that **yields** Agentex events back over the + HTTP/JSON-RPC response (sync ACP agents). +- `_<harness>_async.py` — a converter that **auto-sends** to the task stream (Redis via + `adk.streaming`) for async + temporal agents. +- `_<harness>_tracing.py` — a separate, opt-in tracing handler wired into the converter + by hand. + +Consequences: + +- The native-output → Agentex-event mapping exists in two places per harness (sync and + async) and can drift. +- Tracing is bolted on per harness and is inconsistent across harnesses. 
+- There is no shared notion of a tool/reasoning span tree or turn-level metadata. +- The golden agent grew a parallel "harness layer" (a neutral `HarnessEvent` vocabulary + plus an adapter that drives `adk.streaming` + `adk.tracing`) to solve the same problem + for its subprocess CLI harnesses (claude-code, codex). That logic is valuable but lives + outside the SDK. + +## Goal (end state) + +pydantic-ai, LangGraph, OpenAI Agents, claude-code, and codex all emit through one unified +surface. A single pass over a harness's output drives **streaming, message persistence, and +tracing** from one source of truth, in the same shape as Agentex events. The surface works +for **both** delivery channels (sync yield, async/temporal auto-send). Tracing is on by +default and overridable. The claude-code/codex *parsers* live in the SDK; their sandbox / +secret / MCP orchestration stays in the golden agent. + +## Approach: Agentex event stream is canonical (Approach A) + +The Agentex `StreamTaskMessage*` stream is the single source of truth. Each harness maps its +native output to that stream **once**. A single emitter consumes that one stream and fans it +out to delivery (yield or auto-send) and to tracing (spans derived from the same stream). + +We considered two alternatives and rejected them: + +- **Neutral `AgentEvent` vocabulary + dual projectors (Approach B):** richer (carries turn + usage/cost natively, clean start/end pairing) but reintroduces a parallel vocabulary to + keep in sync with Agentex types, for the same outcome. +- **Push-to-sink with typed emitter methods (Approach C):** very testable, but the *yield* + delivery channel fights a push API (needs a queue/generator bridge), and sync ACP agents + depend on yield. + +Approach A matches "same shape as Agentex events" most directly, makes the yield channel +free, and lets us delete the per-harness tracing code by deriving spans from the canonical +stream. 
+ +## Components + +Four shared, harness-independent components plus one thin tap per harness. + +### 1. Per-harness tap (the only per-harness code) + +``` +convert_<harness>_to_agentex_events(native_stream, ...) -> AsyncIterator[StreamTaskMessage*] +``` + +The existing sync converters (`convert_pydantic_ai_to_agentex_events`, +`convert_langgraph_to_agentex_events`, `convert_openai_to_agentex_events`) already have this +shape and *become* the taps. New taps: `convert_claude_code_to_agentex_events`, +`convert_codex_to_agentex_events` (pure parsers over the CLIs' newline-delimited +stream-json; no SGP/sandbox coupling). + +### 2. Auto-send adapter (shared) + +Consumes the canonical Agentex stream and drives `adk.streaming` context managers: open/close +text and reasoning contexts, switch cleanly between them, stream tool request/response. This +generalizes the golden agent's `AgentexStreamAdapter` and replaces the N hand-written +`_async` bodies with one. Returns the accumulated final text (preserving current +auto-send return values). + +### 3. Yield adapter (shared) + +Passes the canonical stream through to the caller (sync HTTP ACP), tee-ing each event to the +tracer as a side effect. + +### 4. Tracing tap (shared) + +Derives spans from the canonical stream: + +- tool span = `ToolRequestContent` (start/full) → matching `ToolResponseContent` by + `tool_call_id`. +- reasoning span = reasoning start → done. +- subagent span = the Task/Agent tool's span (a tool span by another name). + +Default-on whenever a trace context exists; **overridable** by passing a custom tracer, or +`None` to disable. Replaces the per-harness `_tracing.py` handlers. + +### Facade + +A `UnifiedEmitter` ties the chosen delivery adapter and the tracer together so an agent +author calls one thing. + +### Proposed layout + +- Shared components: `src/agentex/lib/core/harness/` (delivery adapters, tracing tap, span + derivation, facade). +- Taps: remain in `src/agentex/lib/adk/_modules/`. 
+- Public access: via the `adk` facade. + +## Data flow + +One pass over the canonical stream, fanned out by delivery mode. + +- **Sync agent:** `async for ev in emitter.yield_events(convert_X(native)): ...` — the tracer + observes each event; the event is yielded over the HTTP/JSON-RPC response. +- **Async + temporal agent:** `await emitter.auto_send(convert_X(native), task_id=...)` — the + auto-send adapter pushes deltas to Redis via `adk.streaming` while the tracer observes the + same events; returns accumulated final text. Temporal is identical, called from inside an + activity (converters run in activities, not workflows, so determinism is not a concern). +- **Tracing** is the same derivation in both modes (it observes the canonical stream), so + sync and auto-send produce identical spans. +- **Turn-level metadata** (usage / cost / model) is not an Agentex event. It rides a small + side-channel: the tap returns a final typed `TurnResult` (or yields a terminal record) + that the caller attaches to the turn span. This mirrors how the golden agent already treats + `TurnCompleted` as "handled by the caller, not the stream." + +Net dedup: **3 files × N harnesses → 1 tap × N harnesses + 3 shared components.** + +## Backwards compatibility (every change is additive) + +The end state "replaces" the old converters, but it is reached additively. No public symbol +is removed in this stack; nothing regresses. + +- **Taps:** existing `convert_*_to_agentex_events` keep exact signatures and output. Behavior + is unchanged when no trace context is present. +- **Auto-send entry points** (`stream_langgraph_events(stream, task_id)`, the pydantic/openai + `_async` helpers, `run_agent_streamed_auto_send`, `chat_completion_stream_auto_send`) keep + signatures and return values, reimplemented to delegate to the shared auto-send adapter. + Feature-add: they emit traces by default. The conformance suite asserts equivalent Redis + messages before/after. 
+- **`_tracing.py` handlers** stay importable as shims; the shared tracer supersedes them + internally. +- **Removal/deprecation** of dead internal duplication is the final PR, behind a deprecation + note, never mixed into a migration PR. + +## Rollout — stacked PRs (each < 1000 lines diff) + +1. **Span derivation (`TracingTap`)** — pure function: canonical stream → spans. + Unit-tested in isolation. No wiring. +2. **Auto-send adapter** — canonical stream → `adk.streaming` side effects. Fixture-tested. + Not yet wired into harnesses. +3. **Yield adapter + `UnifiedEmitter` facade + public `adk` surface** — plus the + conformance-test scaffold (fixture format + parametrized runner) and an empty CI + integration job. +4. **Migrate pydantic-ai** — reimplement its `_async` / tracing on the shared components; + keep `convert_pydantic_ai_to_agentex_events` signature; default tracing on. Add 3 test + agents (sync / async / temporal) + CI matrix entries + live smoke. +5. **Migrate LangGraph** — same pattern + 3 test agents + CI. +6. **Migrate OpenAI Agents** — same pattern + 3 test agents + CI. +7. **claude-code parser tap** — `convert_claude_code_to_agentex_events` + recorded stream-json + fixtures + feasible test agent(s) (likely temporal-only, given the sandbox requirement). +8. **codex parser tap** — same shape + fixtures + feasible test agent(s). +9. **Cleanup** — delete now-dead internal duplication, deprecate shims, docs. + +## Testing + +### Offline conformance suite (every PR) + +Committed raw harness outputs (pydantic `AgentStreamEvent`s, LangGraph chunks, OpenAI stream, +claude/codex stream-json) drive a shared parametrized suite. For each fixture, assert: + +- exact normalized `StreamTaskMessage*` sequence, +- derived span tree, +- **yield-vs-auto-send equivalence** — both channels produce the same logical events/spans. + +Every tap must pass the shared cases: text, reasoning, single tool, tool error, multi-step, +and interleaved reasoning + tool ordering. 
Deterministic, offline, no network. + +### Live integration matrix (CI) + +Three test agents per harness, one per agent type (sync / async / temporal), deployed and +driven with a fixed prompt. Assert the unified surface produced valid ordered messages and a +well-formed span tree. Modeled on the existing `agentex-tutorials-test.yml` / +`build-and-push-tutorial-agent.yml` CI precedent. + +Matrix: harness ∈ {pydantic-ai, langgraph, openai-agents, claude-code, codex} × agent-type ∈ +{sync, async, temporal}. claude-code/codex run the subset of agent types that is feasible; +any uncovered cell is logged/documented, never silently skipped. + +### Error handling + +- A tap that raises mid-stream closes open streaming contexts and open spans — no leaked + `adk.streaming` context, no dangling span. +- Tracing failures are best-effort and never break delivery (matches the golden agent's + contract). + +## Out of scope + +- Sandbox pool, sandbox lifecycle, MCP server provisioning, and OAuth/secret reauth — tracked + separately; only the pure claude-code/codex output parsers are in scope here. +- claude-code/codex sandbox / secret / MCP orchestration — stays in the golden agent and + feeds the SDK parser. From 4538544806c518e1a9be15cb711bfae596c8f882 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 11:30:03 -0400 Subject: [PATCH 02/35] =?UTF-8?q?docs:=20refine=20unified=20harness=20spec?= =?UTF-8?q?=20=E2=80=94=20span=20derivation=20rules,=20TurnUsage,=20golden?= =?UTF-8?q?-agent=20integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make tracing-tap span derivation explicit (tool open on Done of a ToolRequestContent index, close on matching ToolResponseContent by tool_call_id; parallel-safe; reasoning start->done). Flag missing is_error on ToolResponseContent as an additive upstream decision. 
- Add first-class TurnUsage/TurnResult shape (aligned to llm_metrics token taxonomy) attached to the turn span via span(data=) and reused for metrics. - Document golden-agent integration: all SGP/sandbox/secret/MCP coupling stays in the agent; only parsing/streaming/tracing/usage move to SDK taps + emitter; sandbox-setup events chain before the harness stream. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...26-06-18-unified-harness-surface-design.md | 124 ++++++++++++++++-- 1 file changed, 116 insertions(+), 8 deletions(-) diff --git a/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md index 8e5411863..c3b54c117 100644 --- a/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md +++ b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md @@ -87,16 +87,34 @@ tracer as a side effect. ### 4. Tracing tap (shared) -Derives spans from the canonical stream: +A stateful reducer that derives spans from the canonical stream. It only *observes* +`index` and `tool_call_id`; it never mutates or reorders the stream, so streaming fidelity +is unchanged. -- tool span = `ToolRequestContent` (start/full) → matching `ToolResponseContent` by +Derivation rules: + +- **Tool span open:** on `StreamTaskMessageDone` for an index whose `Start` content was a + `ToolRequestContent`. Arguments are fully known by `Done` (covers both streamed-args and + one-shot tools). The open span is keyed by `tool_call_id`. +- **Tool span close:** on `StreamTaskMessageFull(ToolResponseContent)` matching by `tool_call_id`. -- reasoning span = reasoning start → done. -- subagent span = the Task/Agent tool's span (a tool span by another name). +- **Parallel / interleaved tools:** `ToolRequestContent`, `ToolResponseContent`, + `ToolRequestDelta`, and `ToolResponseDelta` all carry `tool_call_id` + `name`, so multiple + open tool spans pair correctly regardless of arrival order. 
+- **Reasoning span:** `Start(ReasoningContent)` → `Done` on that index. +- **Subagent span:** the Task/Agent tool's span (a tool span by another name), nested under + the turn span. Default-on whenever a trace context exists; **overridable** by passing a custom tracer, or `None` to disable. Replaces the per-harness `_tracing.py` handlers. +**Open decision — tool error status.** `ToolResponseContent` currently has no +`is_error`/`status` field (only `content`), so a derived tool span cannot mark failure. The +golden agent's `ToolCompleted` carried `is_error`. Recommended resolution: add an additive +optional `is_error: bool | None` to `ToolResponseContent`. This is a generated type, so it is +a small upstream API-spec change (tracked as a prerequisite to the relevant migration PR), not +a local edit. Until it lands, derived spans omit tool error status rather than inferring it. + ### Facade A `UnifiedEmitter` ties the chosen delivery adapter and the tracer together so an agent @@ -121,13 +139,66 @@ One pass over the canonical stream, fanned out by delivery mode. activity (converters run in activities, not workflows, so determinism is not a concern). - **Tracing** is the same derivation in both modes (it observes the canonical stream), so sync and auto-send produce identical spans. -- **Turn-level metadata** (usage / cost / model) is not an Agentex event. It rides a small - side-channel: the tap returns a final typed `TurnResult` (or yields a terminal record) - that the caller attaches to the turn span. This mirrors how the golden agent already treats - `TurnCompleted` as "handled by the caller, not the stream." +- **Turn-level metadata** (usage / cost / model) is not an Agentex event, so it is surfaced + as a first-class `TurnUsage` shape rather than ad-hoc data (see below). 
Net dedup: **3 files × N harnesses → 1 tap × N harnesses + 3 shared components.** +## Unified turn usage / cost + +Turn metadata is a first-class, harness-independent shape attached to the turn span and +returned to the caller — not a loose side-channel. + +``` +class TurnUsage(BaseModel): + model: str | None + input_tokens: int | None + output_tokens: int | None + cached_input_tokens: int | None # subset of input_tokens served from cache + reasoning_tokens: int | None # subset of output_tokens + total_tokens: int | None + cost_usd: float | None + duration_ms: int | None # wall-clock, measured by the emitter + num_llm_calls: int + num_tool_calls: int # derived from the canonical stream + num_reasoning_blocks: int # derived from the canonical stream + +class TurnResult(BaseModel): + final_text: str + usage: TurnUsage +``` + +- Token field names align with the existing `agentex.lib.core.observability.llm_metrics` + taxonomy (`input_tokens` / `output_tokens` / `cached_input_tokens` / `reasoning_tokens`), + not a new vocabulary. (The OpenAI-style `llm_messages.Usage` — + `prompt_tokens`/`completion_tokens` — is mapped into this richer shape.) +- **Each harness tap normalizes its native usage** into `TurnUsage`: pydantic-ai + `result.usage()`, LangGraph `usage_metadata`, OpenAI `response.usage`, claude-code/codex + the final `result` envelope (`cost_usd` + usage). Per-harness normalization, one output + shape. +- The stream-derived counts (`num_tool_calls`, `num_reasoning_blocks`) come for free from the + tracing tap's reduction; `duration_ms` is measured by the emitter; tokens/cost/model come + from the tap's native-usage normalization. +- The emitter attaches `TurnUsage` to the **turn span** via `adk.tracing.span(data=...)` + (which already accepts a `BaseModel`) and returns `TurnResult` to the caller. The same + object can feed the OTel `LLMMetrics` and downstream metrics (e.g. the golden agent's + per-turn DogStatsD emission), so traces and metrics share one shape. 
+ +### Surfacing `TurnUsage` from the tap + +Python async generators cannot cleanly return a value to their consumer, so the tap does not +return `TurnUsage` via `StopAsyncIteration`. Instead the per-harness entry is a small object: + +``` +class HarnessTurn: + events: AsyncIterator[StreamTaskMessage*] # the canonical stream + def usage(self) -> TurnUsage # populated once `events` is exhausted +``` + +The emitter drives `events` (delivering + tracing), then reads `usage()` to finalize the turn +span and build `TurnResult`. This keeps the canonical stream pure (only `StreamTaskMessage*`) +while giving usage/cost a typed home. + ## Backwards compatibility (every change is additive) The end state "replaces" the old converters, but it is reached additively. No public symbol @@ -196,6 +267,43 @@ any uncovered cell is logged/documented, never silently skipped. - Tracing failures are best-effort and never break delivery (matches the golden agent's contract). +## Golden agent integration (SGP / sandbox coupling preserved) + +The unified surface is designed so the golden agent keeps **all** of its SGP-coupled layers +and only swaps its hand-rolled parsing/streaming/tracing internals for the SDK's taps + +emitter. Nothing SGP-specific moves into the SDK. + +What stays in the golden agent, untouched: + +- Sandbox pool acquire modes (cold-create / warm-claim / reconnect), lease coordination, and + the data-plane URL override. +- Secret resolution, OAuth/MCP reauth, and reconnect-notice emission (the notice is just + another standalone message on the task stream, independent of the harness tap). +- Spawning `claude -p` / `codex exec` inside the sandbox. + +What changes inside the golden agent's provider: + +1. Acquire/provision the sandbox and resolve secrets/MCP exactly as today (SGP-coupled). +2. Spawn the CLI in the sandbox and feed its stdout (stream-json lines) into the SDK tap + `convert_claude_code_to_agentex_events` / `convert_codex_to_agentex_events`. +3. 
Run that tap through the SDK emitter's **auto-send** path from inside the existing Temporal + activity, getting streaming + tracing + `TurnUsage` for free. The agent's + `_StreamJsonProcessor` and `AgentexStreamAdapter` are retired in favor of the SDK tap + + emitter. + +**Sandbox-setup events:** today the golden agent surfaces provisioning steps (reconnect / +find / create / configure-git / clone) as UI tool calls by yielding them into the same +adapter. Under the unified surface these become agent-produced `ToolRequestContent` / +`ToolResponseContent` messages, chained *before* the harness tap's stream into one canonical +stream for the turn (`chain(setup_events, convert_claude_code(stdout))`). The emitter then +delivers and traces the whole turn uniformly, so setup steps keep appearing in the UI and the +span tree. + +This means the claude-code/codex parser PRs (7, 8) deliver the SDK taps, and a corresponding +**golden-agent-side change** (out of this repo's PR stack) rewires its providers onto them. +The golden agent's in-process litellm / OpenAI-Agents harness can likewise adopt the OpenAI +tap, though that is optional and not required by this design. 
+ ## Out of scope - Sandbox pool, sandbox lifecycle, MCP server provisioning, and OAuth/secret reauth — tracked From 4573d18f5bc65553d648e230bdb5bbfaef858044 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 11:34:38 -0400 Subject: [PATCH 03/35] docs: link deferred tool-error decision to AGX1-371 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../specs/2026-06-18-unified-harness-surface-design.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md index c3b54c117..e8a32f112 100644 --- a/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md +++ b/docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md @@ -112,8 +112,9 @@ Default-on whenever a trace context exists; **overridable** by passing a custom `is_error`/`status` field (only `content`), so a derived tool span cannot mark failure. The golden agent's `ToolCompleted` carried `is_error`. Recommended resolution: add an additive optional `is_error: bool | None` to `ToolResponseContent`. This is a generated type, so it is -a small upstream API-spec change (tracked as a prerequisite to the relevant migration PR), not -a local edit. Until it lands, derived spans omit tool error status rather than inferring it. +a small upstream API-spec change, not a local edit. **Deferred** — tracked in Linear as +AGX1-371 (Agentex "Starter Tasks"). Until it lands, derived spans omit tool error status +rather than inferring it. ### Facade From 8eda56dc21c11cd5f341b2f641dc6c4959f056c9 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 11:40:42 -0400 Subject: [PATCH 04/35] docs: foundation implementation plan for unified harness surface (PRs 1-3) Bite-sized TDD tasks: foundation types, pure SpanDeriver, SpanTracer adapter, yield + auto_send delivery, UnifiedEmitter facade, conformance scaffold + CI job. 
Migration/parser PRs (4-9) listed as follow-on plans. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...6-18-unified-harness-surface-foundation.md | 1309 +++++++++++++++++ 1 file changed, 1309 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-18-unified-harness-surface-foundation.md diff --git a/docs/superpowers/plans/2026-06-18-unified-harness-surface-foundation.md b/docs/superpowers/plans/2026-06-18-unified-harness-surface-foundation.md new file mode 100644 index 000000000..0aefef060 --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-unified-harness-surface-foundation.md @@ -0,0 +1,1309 @@ +# Unified Harness Surface — Foundation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build the shared, harness-independent machinery (span derivation, auto-send delivery, yield delivery, unified emitter, turn-usage types) that the per-harness taps will plug into — corresponding to PRs 1–3 of the design's rollout. + +**Architecture:** The Agentex `StreamTaskMessage*` stream is the single source of truth (design Approach A). A pure `SpanDeriver` reduces that stream into open/close span signals. Two delivery adapters consume the same stream — `yield_events` (sync HTTP ACP) and `auto_send` (async/temporal, via `adk.streaming`) — and both observe the deriver to drive `adk.tracing`. A `UnifiedEmitter` ties delivery + tracing + `TurnUsage` together. + +**Tech Stack:** Python 3, pydantic v2 (`BaseModel`), pytest + pytest-asyncio, the existing `agentex.lib.adk` streaming/tracing facades. + +**Spec:** `docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md` + +**Scope note:** This plan covers only the foundation (PRs 1–3). 
The per-harness migration PRs (4–6: pydantic-ai, langgraph, openai) and parser PRs (7–8: claude-code, codex) each require close reading of that harness's existing converter and get their own plans once this foundation lands. PR 9 (cleanup) follows them. See "Subsequent plans" at the end. + +--- + +## File Structure + +- Create `src/agentex/lib/core/harness/__init__.py` — package marker + public re-exports. +- Create `src/agentex/lib/core/harness/types.py` — `OpenSpan`, `CloseSpan`, `SpanSignal`, `TurnUsage`, `TurnResult`, `HarnessTurn` protocol. +- Create `src/agentex/lib/core/harness/span_derivation.py` — `SpanDeriver` (pure reducer). +- Create `src/agentex/lib/core/harness/auto_send.py` — `auto_send()` (canonical stream → `adk.streaming` + tracing). +- Create `src/agentex/lib/core/harness/yield_delivery.py` — `yield_events()` (passthrough + tracing). +- Create `src/agentex/lib/core/harness/emitter.py` — `UnifiedEmitter` facade. +- Create tests under `tests/lib/core/harness/`. + +Each file has one responsibility; `span_derivation.py` has zero dependencies on `adk` so it is unit-testable in isolation. + +--- + +## Task 1: Foundation types + +**Files:** +- Create: `src/agentex/lib/core/harness/__init__.py` +- Create: `src/agentex/lib/core/harness/types.py` +- Test: `tests/lib/core/harness/test_types.py` + +- [ ] **Step 1: Create the package marker** + +Create `src/agentex/lib/core/harness/__init__.py`: + +```python +"""Shared, harness-independent machinery for the unified harness surface. + +The Agentex StreamTaskMessage* stream is the single source of truth; this +package derives spans from it and delivers it (yield or auto-send), so every +harness tap gets streaming + tracing + turn usage uniformly. 
+""" +``` + +- [ ] **Step 2: Write the failing test for the types** + +Create `tests/lib/core/harness/__init__.py` (empty) and `tests/lib/core/harness/test_types.py`: + +```python +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + TurnResult, +) + + +def test_open_close_span_construct(): + o = OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}) + c = CloseSpan(key="call_1", output="files", is_complete=True) + assert o.key == c.key == "call_1" + assert o.kind == "tool" + assert c.is_complete is True + + +def test_turn_usage_defaults_are_none(): + u = TurnUsage(model="claude-opus-4-6") + assert u.model == "claude-opus-4-6" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +def test_turn_result_wraps_usage(): + r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) + assert r.final_text == "hi" + assert r.usage.model == "m" +``` + +- [ ] **Step 3: Run test to verify it fails** + +Run: `pytest tests/lib/core/harness/test_types.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.types` + +- [ ] **Step 4: Implement the types** + +Create `src/agentex/lib/core/harness/types.py`: + +```python +"""Types for the unified harness surface.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, AsyncIterator, Literal, Protocol, Union, runtime_checkable + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.lib.utils.model_utils import BaseModel + +# The canonical stream element. Taps yield these; delivery adapters consume them. +StreamTaskMessage = Union[ + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +] + +SpanKind = Literal["tool", "reasoning", "subagent"] + + +@dataclass +class OpenSpan: + """Signal to open a child span. 
`key` pairs an open with its close.""" + + key: str + kind: SpanKind + name: str + input: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CloseSpan: + """Signal to close the span previously opened with the same `key`.""" + + key: str + output: Any = None + is_complete: bool = True # False when closed by flush() without a result + + +SpanSignal = Union[OpenSpan, CloseSpan] + + +class TurnUsage(BaseModel): + """Harness-independent turn usage/cost, attached to the turn span. + + Token field names align with agentex.lib.core.observability.llm_metrics. + """ + + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + num_llm_calls: int = 0 + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 + + +class TurnResult(BaseModel): + """Returned to the caller after a turn is delivered.""" + + final_text: str = "" + usage: TurnUsage = TurnUsage() + + +@runtime_checkable +class HarnessTurn(Protocol): + """A single harness turn: a canonical stream plus its normalized usage. + + Python async generators cannot cleanly return a value to their consumer, so + a tap exposes usage via `usage()` (valid only after `events` is exhausted) + rather than via StopAsyncIteration. + """ + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... 
+``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `pytest tests/lib/core/harness/test_types.py -v` +Expected: PASS (3 passed) + +- [ ] **Step 6: Commit** + +```bash +git add src/agentex/lib/core/harness/__init__.py src/agentex/lib/core/harness/types.py tests/lib/core/harness/__init__.py tests/lib/core/harness/test_types.py +git commit -m "feat(harness): foundation types for unified harness surface" +``` + +--- + +## Task 2: SpanDeriver (pure span derivation) — PR 1 + +**Files:** +- Create: `src/agentex/lib/core/harness/span_derivation.py` +- Test: `tests/lib/core/harness/test_span_derivation.py` + +Derivation rules (from the spec): tool span opens on the `Done` of an index whose `Start` +was a `ToolRequestContent`, and closes on the matching `ToolResponseContent` by +`tool_call_id`; reasoning span opens on `Start(ReasoningContent)` and closes on that index's +`Done`. Parallel tools are keyed by `tool_call_id`. `flush()` closes anything still open. + +- [ ] **Step 1: Write failing tests (text, single tool, reasoning, parallel, streamed args, unclosed)** + +Create `tests/lib/core/harness/test_span_derivation.py`: + +```python +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +) +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.tool_request_delta import ToolRequestDelta + + +def _signals(deriver, events): + out = [] + for e in events: + out.extend(deriver.observe(e)) + out.extend(deriver.flush()) + return out + + +def _tool_req(idx, tcid, name, args): + return StreamTaskMessageStart( + type="start", index=idx, + 
content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id=tcid, name=name, arguments=args), + ) + + +def test_text_only_yields_no_spans(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, + delta=None), + StreamTaskMessageDone(type="done", index=0), + ] + assert _signals(d, events) == [] + + +def test_single_tool_opens_on_done_closes_on_response(): + d = SpanDeriver() + events = [ + _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="call_1", name="Bash", content="files")), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}), + CloseSpan(key="call_1", output="files", is_complete=True), + ] + + +def test_reasoning_opens_on_start_closes_on_done(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[])), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="reasoning:0", kind="reasoning", name="reasoning", input={}) + assert sigs[1] == CloseSpan(key="reasoning:0", output=None, is_complete=True) + + +def test_parallel_tools_pair_by_tool_call_id(): + d = SpanDeriver() + events = [ + _tool_req(0, "a", "T1", {}), + _tool_req(1, "b", "T2", {}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + StreamTaskMessageFull(type="full", index=2, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="b", name="T2", content="rb")), + StreamTaskMessageFull(type="full", index=3, + content=ToolResponseContent(type="tool_response", 
author="agent", + tool_call_id="a", name="T1", content="ra")), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + closes = [s for s in sigs if isinstance(s, CloseSpan)] + assert {o.key for o in opens} == {"a", "b"} + assert [c.key for c in closes] == ["b", "a"] + assert all(c.is_complete for c in closes) + + +def test_streamed_args_accumulate_into_open_input(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDelta(type="delta", index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", + arguments_delta='{"cmd":')), + StreamTaskMessageDelta(type="delta", index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", + arguments_delta='"ls"}')), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="c", kind="tool", name="Bash", input={"cmd": "ls"}) + + +def test_unclosed_tool_closed_incomplete_on_flush(): + d = SpanDeriver() + events = [ + _tool_req(0, "x", "Bash", {}), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) + assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/lib/core/harness/test_span_derivation.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.span_derivation` + +- [ ] **Step 3: Implement `SpanDeriver`** + +Create `src/agentex/lib/core/harness/span_derivation.py`: + +```python +"""Pure reducer: canonical StreamTaskMessage* stream -> span open/close signals. + +Has no dependency on adk; unit-testable in isolation. Delivery adapters feed it +every event and act on the returned signals. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import Any + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) + +from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal, StreamTaskMessage + + +@dataclass +class _ToolReqMeta: + tool_call_id: str + name: str + arguments: dict[str, Any] + args_buf: str = "" # accumulated streamed argument fragments + + +class SpanDeriver: + """Stateful reducer over the canonical stream. + + Tool span: open on Done of a ToolRequestContent index; close on matching + ToolResponseContent by tool_call_id. Reasoning span: open on + Start(ReasoningContent); close on that index's Done. + """ + + def __init__(self) -> None: + # index -> tool request metadata (present only for tool_request indices) + self._tool_by_index: dict[int, _ToolReqMeta] = {} + # index -> reasoning open (present only for reasoning indices) + self._reasoning_index_open: set[int] = set() + # tool_call_ids with a currently-open span + self._open_tool_ids: set[str] = set() + + def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: + if isinstance(event, StreamTaskMessageStart): + return self._on_start(event) + if isinstance(event, StreamTaskMessageDelta): + return self._on_delta(event) + if isinstance(event, StreamTaskMessageFull): + return self._on_full(event) + if isinstance(event, StreamTaskMessageDone): + return self._on_done(event) + return [] + + def flush(self) -> list[SpanSignal]: + """Close anything still open at end of stream, marked incomplete.""" + signals: list[SpanSignal] = [] + for tcid in list(self._open_tool_ids): + signals.append(CloseSpan(key=tcid, output=None, is_complete=False)) + self._open_tool_ids.clear() + for idx in sorted(self._reasoning_index_open): + signals.append(CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=False)) + 
self._reasoning_index_open.clear() + return signals + + def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + content = event.content + idx = event.index if event.index is not None else -1 + ctype = getattr(content, "type", None) + if ctype == "tool_request": + self._tool_by_index[idx] = _ToolReqMeta( + tool_call_id=content.tool_call_id, + name=content.name, + arguments=dict(content.arguments or {}), + ) + return [] + if ctype == "reasoning": + self._reasoning_index_open.add(idx) + return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] + return [] + + def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: + idx = event.index if event.index is not None else -1 + delta = event.delta + if delta is not None and getattr(delta, "type", None) == "tool_request": + meta = self._tool_by_index.get(idx) + if meta is not None and delta.arguments_delta: + meta.args_buf += delta.arguments_delta + return [] + + def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + content = event.content + if getattr(content, "type", None) == "tool_response": + tcid = content.tool_call_id + if tcid in self._open_tool_ids: + self._open_tool_ids.discard(tcid) + return [CloseSpan(key=tcid, output=content.content, is_complete=True)] + return [] + + def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: + idx = event.index if event.index is not None else -1 + meta = self._tool_by_index.pop(idx, None) + if meta is not None: + args = meta.arguments + if meta.args_buf: + try: + args = json.loads(meta.args_buf) + except json.JSONDecodeError: + args = {"_raw": meta.args_buf} + self._open_tool_ids.add(meta.tool_call_id) + return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] + if idx in self._reasoning_index_open: + self._reasoning_index_open.discard(idx) + return [CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=True)] + return [] +``` + +- [ ] **Step 4: Run tests to 
verify they pass** + +Run: `pytest tests/lib/core/harness/test_span_derivation.py -v` +Expected: PASS (6 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/agentex/lib/core/harness/span_derivation.py tests/lib/core/harness/test_span_derivation.py +git commit -m "feat(harness): pure SpanDeriver reducing the canonical stream to span signals" +``` + +--- + +## Task 3: Tracer adapter (span signals -> adk.tracing) + +**Files:** +- Create: `src/agentex/lib/core/harness/tracer.py` +- Test: `tests/lib/core/harness/test_tracer.py` + +A thin adapter that turns `SpanSignal`s into `adk.tracing` spans, nesting them under a parent +span. Kept separate from `SpanDeriver` so derivation stays pure and tracing stays overridable. +Tracing is best-effort: failures are logged, never raised (spec error-handling contract). + +- [ ] **Step 1: Write the failing test (uses a fake adk.tracing)** + +Create `tests/lib/core/harness/test_tracer.py`: + +```python +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import OpenSpan, CloseSpan + + +class _FakeSpan: + def __init__(self, name): + self.name = name + + +class _FakeTracing: + def __init__(self): + self.started = [] + self.ended = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span, output=None, data=None): + self.ended.append((span.name, output)) + + +@pytest.mark.asyncio +async def test_open_then_close_starts_and_ends_span(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.started == [("Bash", "p1", {"cmd": "ls"})] + assert fake.ended == [("Bash", "files")] + +
+@pytest.mark.asyncio +async def test_no_trace_id_is_noop(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="", parent_span_id=None, tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert fake.started == [] and fake.ended == [] + + +@pytest.mark.asyncio +async def test_tracing_failure_is_swallowed(): + class _Boom(_FakeTracing): + async def start_span(self, **kw): + raise RuntimeError("backend down") + + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=_Boom()) + # Must not raise. + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/lib/core/harness/test_tracer.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.tracer` + +- [ ] **Step 3: Implement `SpanTracer`** + +Create `src/agentex/lib/core/harness/tracer.py`: + +```python +"""Adapter from SpanSignals to adk.tracing spans (best-effort, overridable).""" + +from __future__ import annotations + +from typing import Any + +from agentex.lib.utils.logging import make_logger +from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal + +logger = make_logger(__name__) + + +class SpanTracer: + """Opens/closes adk.tracing child spans in response to span signals. + + `tracing` defaults to the real `adk.tracing` module; inject a fake in tests + or a custom tracer to override. No-op when `trace_id` is falsy. Never raises. 
+ """ + + def __init__(self, trace_id: str | None, parent_span_id: str | None, tracing: Any = None, task_id: str | None = None): + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self.task_id = task_id + if tracing is None: + from agentex.lib import adk + + tracing = adk.tracing + self._tracing = tracing + self._open: dict[str, Any] = {} # span key -> span object + + async def handle(self, signal: SpanSignal) -> None: + if not self.trace_id: + return + try: + if isinstance(signal, OpenSpan): + span = await self._tracing.start_span( + trace_id=self.trace_id, + name=signal.name, + input=signal.input, + parent_id=self.parent_span_id, + task_id=self.task_id, + ) + if span is not None: + self._open[signal.key] = span + elif isinstance(signal, CloseSpan): + span = self._open.pop(signal.key, None) + if span is not None: + await self._tracing.end_span( + trace_id=self.trace_id, + span=span, + output=signal.output, + ) + except Exception as exc: # best-effort: tracing never breaks delivery + logger.warning("[harness.tracer] span signal failed: %s", exc) +``` + +Note for the implementer: confirm `adk.tracing.end_span` accepts `output=` (seen in +`src/agentex/lib/adk/_modules/tracing.py`). If the kwarg differs, adjust the call and the +fake in the test together. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/lib/core/harness/test_tracer.py -v` +Expected: PASS (3 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/agentex/lib/core/harness/tracer.py tests/lib/core/harness/test_tracer.py +git commit -m "feat(harness): SpanTracer adapter from span signals to adk.tracing" +``` + +--- + +## Task 4: `yield_events` delivery adapter — PR 3 (part 1) + +**Files:** +- Create: `src/agentex/lib/core/harness/yield_delivery.py` +- Test: `tests/lib/core/harness/test_yield_delivery.py` + +`yield_events` passes the canonical stream through unchanged (for sync HTTP ACP agents) while +feeding the `SpanDeriver` + `SpanTracer` as a side effect. 
Streaming fidelity is untouched. + +- [ ] **Step 1: Write the failing test** + +Create `tests/lib/core/harness/test_yield_delivery.py`: + +```python +import pytest + +from agentex.lib.core.harness.yield_delivery import yield_events +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDone, + StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return object() + + async def end_span(self, *, trace_id, span, output=None, data=None): + self.ended.append(output) + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_passes_events_through_and_traces(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="c", name="Bash", content="ok")), + ] + out = [e async for e in yield_events(_gen(events), tracer=tracer)] + assert out == events # passthrough unchanged + assert fake.started == ["Bash"] # span derived + opened + assert fake.ended == ["ok"] # span closed with response +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/lib/core/harness/test_yield_delivery.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.yield_delivery` + +- [ ] **Step 3: Implement `yield_events`** + 
+Create `src/agentex/lib/core/harness/yield_delivery.py`: + +```python +"""Yield delivery: pass the canonical stream through, tracing as a side effect.""" + +from __future__ import annotations + +from typing import AsyncIterator + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import StreamTaskMessage + + +async def yield_events( + events: AsyncIterator[StreamTaskMessage], + tracer: SpanTracer | None = None, +) -> AsyncIterator[StreamTaskMessage]: + """Forward each event to the caller; derive + trace spans as a side effect. + + For sync HTTP ACP agents that yield events back over the response. When + `tracer` is None, this is a pure passthrough. + """ + deriver = SpanDeriver() if tracer is not None else None + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + yield event + finally: + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/lib/core/harness/test_yield_delivery.py -v` +Expected: PASS (1 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/agentex/lib/core/harness/yield_delivery.py tests/lib/core/harness/test_yield_delivery.py +git commit -m "feat(harness): yield_events delivery adapter (passthrough + tracing)" +``` + +--- + +## Task 5: `auto_send` delivery adapter — PR 2 + +**Files:** +- Create: `src/agentex/lib/core/harness/auto_send.py` +- Test: `tests/lib/core/harness/test_auto_send.py` + +`auto_send` consumes the canonical stream and drives `adk.streaming` context managers: it opens +a text context for `TextContent`, a reasoning context for `ReasoningContent`, switches cleanly +between them, and posts tool request/response as full messages. 
It feeds the same +`SpanDeriver`/`SpanTracer` and returns `TurnResult`. This generalizes the golden agent's +`AgentexStreamAdapter` (`teams/sgp/agents/golden_agent/project/harness/adapter.py`) to consume +`StreamTaskMessage*` instead of `HarnessEvent`. + +Reference while implementing: `src/agentex/lib/adk/_modules/_langgraph_async.py` +(`stream_langgraph_events`) shows the exact `adk.streaming` open/stream/close pattern to reuse; +`adapter.py` lines 87–130 show the text↔reasoning↔tool switching logic to mirror. + +- [ ] **Step 1: Write the failing test (fake streaming records context lifecycle)** + +Create `tests/lib/core/harness/test_auto_send.py`: + +```python +import pytest + +from agentex.lib.core.harness.auto_send import auto_send +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageDone, +) +from agentex.types.text_content import TextContent +from agentex.types.text_delta import TextDelta + + +class _FakeCtx: + def __init__(self, sink): + self.sink = sink + + async def __aenter__(self): + self.sink.append(("open",)) + return self + + async def __aexit__(self, *a): + self.sink.append(("close",)) + return False + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + self.sink.append(("ctx", getattr(initial_content, "type", None))) + return _FakeCtx(self.sink) + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_auto_send_streams_text_and_returns_final_text(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")), + 
StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Hello" + kinds = [s[0] for s in streaming.sink] + assert kinds[0] == "ctx" and "open" in kinds and "close" in kinds +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/lib/core/harness/test_auto_send.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.auto_send` + +- [ ] **Step 3: Implement `auto_send`** + +Create `src/agentex/lib/core/harness/auto_send.py`. The implementer mirrors the text↔reasoning +switching from `adapter.py` and the `adk.streaming` usage from `_langgraph_async.py`: + +```python +"""Auto-send delivery: canonical stream -> adk.streaming side effects + tracing.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.text_content import TextContent + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import StreamTaskMessage, TurnResult, TurnUsage + + +async def auto_send( + events: AsyncIterator[StreamTaskMessage], + task_id: str, + tracer: SpanTracer | None = None, + streaming: Any = None, + usage: TurnUsage | None = None, +) -> TurnResult: + """Push the canonical stream to the task stream via adk.streaming. + + Opens a streaming context per text/reasoning message, streams deltas, and + closes on Done; posts tool request/response as full messages; derives and + traces spans from the same stream. Returns the accumulated final text + + usage. For async + temporal agents (call from inside an activity). 
+ """ + if streaming is None: + from agentex.lib import adk + + streaming = adk.streaming + + deriver = SpanDeriver() if tracer is not None else None + final_text_parts: list[str] = [] + current_ctx: Any = None + current_kind: str | None = None # "text" | "reasoning" + + async def _close_current() -> None: + nonlocal current_ctx, current_kind + if current_ctx is not None: + await current_ctx.__aexit__(None, None, None) + current_ctx = None + current_kind = None + + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + + if isinstance(event, StreamTaskMessageStart): + ctype = getattr(event.content, "type", None) + if ctype in ("text", "reasoning"): + await _close_current() + current_ctx = streaming.streaming_task_message_context( + task_id=task_id, initial_content=event.content, + ) + await current_ctx.__aenter__() + current_kind = ctype + elif isinstance(event, StreamTaskMessageDelta): + if current_ctx is not None and event.delta is not None: + await current_ctx.stream_update(event) + if getattr(event.delta, "type", None) == "text" and event.delta.text_delta: + final_text_parts.append(event.delta.text_delta) + elif isinstance(event, StreamTaskMessageDone): + await _close_current() + elif isinstance(event, StreamTaskMessageFull): + # Tool request/response (and any non-streamed full message): post as a + # standalone full message, not tied to the current text/reasoning ctx. 
+ await _close_current() + ctx = streaming.streaming_task_message_context( + task_id=task_id, initial_content=event.content, + ) + await ctx.__aenter__() + await ctx.__aexit__(None, None, None) + finally: + await _close_current() + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) + + return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage()) +``` + +Note for the implementer: validate the exact `streaming_task_message_context` usage against +`_langgraph_async.py` (whether to call `stream_update` with the whole `StreamTaskMessageDelta` +or the inner delta). Adjust the call and the fake together; the test asserts behavior, not the +internal kwarg shape. + +Also note that the sketch above only posts `StreamTaskMessageFull` events as standalone +messages: a tool request emitted as `Start`/`Delta`/`Done` (the shape `SpanDeriver` opens tool +spans on) opens no streaming context, so it is never delivered, and its deltas are forwarded +to whatever text/reasoning context happens to be open. Handle that shape explicitly before +migrating a harness that streams tool requests. + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/lib/core/harness/test_auto_send.py -v` +Expected: PASS (1 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/agentex/lib/core/harness/auto_send.py tests/lib/core/harness/test_auto_send.py +git commit -m "feat(harness): auto_send delivery adapter (canonical stream -> adk.streaming + tracing)" +``` + +--- + +## Task 6: `UnifiedEmitter` facade — PR 3 (part 2) + +**Files:** +- Create: `src/agentex/lib/core/harness/emitter.py` +- Modify: `src/agentex/lib/core/harness/__init__.py` (re-export public surface) +- Test: `tests/lib/core/harness/test_emitter.py` + +`UnifiedEmitter` is the single thing an agent author touches. It owns the trace context, builds +the `SpanTracer` (default-on when a trace context exists, overridable), and exposes both +delivery modes over a `HarnessTurn`. It attaches the turn's `TurnUsage` to delivery.
+ +- [ ] **Step 1: Write the failing test** + +Create `tests/lib/core/harness/test_emitter.py`: + +```python +import pytest + +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.core.harness.types import TurnUsage +from agentex.types.task_message_update import StreamTaskMessageStart, StreamTaskMessageDone +from agentex.types.text_content import TextContent + + +class _Turn: + def __init__(self, events_list, usage): + self._events_list = events_list + self._usage = usage + + @property + async def events(self): + for e in self._events_list: + yield e + + def usage(self): + return self._usage + + +@pytest.mark.asyncio +async def test_emitter_yield_mode_passes_through(): + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, TurnUsage(model="m")) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +@pytest.mark.asyncio +async def test_emitter_tracing_default_on_when_trace_id_present(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p") + assert emitter.tracer is not None + + +@pytest.mark.asyncio +async def test_emitter_tracing_overridable_off(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) + assert emitter.tracer is None +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/lib/core/harness/test_emitter.py -v` +Expected: FAIL with `ModuleNotFoundError: agentex.lib.core.harness.emitter` + +- [ ] **Step 3: Implement `UnifiedEmitter`** + +Create `src/agentex/lib/core/harness/emitter.py`: + +```python +"""UnifiedEmitter: the single facade agent authors use for either delivery mode.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.lib.core.harness.auto_send 
import auto_send +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import HarnessTurn, StreamTaskMessage, TurnResult +from agentex.lib.core.harness.yield_delivery import yield_events + + +class UnifiedEmitter: + """Ties trace context + chosen delivery together. + + Tracing is default-on whenever `trace_id` is truthy; pass `tracer=False` to + disable, or a custom `SpanTracer` to override. + """ + + def __init__( + self, + task_id: str, + trace_id: str | None, + parent_span_id: str | None, + tracer: SpanTracer | bool | None = None, + ): + self.task_id = task_id + self.trace_id = trace_id + self.parent_span_id = parent_span_id + if tracer is False: + self.tracer: SpanTracer | None = None + elif isinstance(tracer, SpanTracer): + self.tracer = tracer + elif trace_id: + self.tracer = SpanTracer(trace_id=trace_id, parent_span_id=parent_span_id, task_id=task_id) + else: + self.tracer = None + + async def yield_turn(self, turn: HarnessTurn) -> AsyncIterator[StreamTaskMessage]: + """Sync HTTP ACP delivery: forward events, trace as side effect.""" + async for event in yield_events(turn.events, tracer=self.tracer): + yield event + + async def auto_send_turn(self, turn: HarnessTurn) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult.""" + return await auto_send( + turn.events, + task_id=self.task_id, + tracer=self.tracer, + usage=turn.usage(), + ) +``` + +Note for the implementer: `auto_send_turn` above evaluates `turn.usage()` while building the +call arguments, i.e. before `auto_send` has consumed `turn.events`. The `HarnessTurn` contract +makes `usage()` valid only after `events` is exhausted, so read usage once the stream has been +drained (for example, attach it to the `TurnResult` after `auto_send` returns) rather than +passing it in up front. + +- [ ] **Step 4: Re-export the public surface** + +Append to `src/agentex/lib/core/harness/__init__.py`: + +```python +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import ( + CloseSpan, + HarnessTurn, + OpenSpan, + SpanSignal, + StreamTaskMessage, + TurnResult, + TurnUsage, +) + +__all__ = [ + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", +]
+``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `pytest tests/lib/core/harness/ -v` +Expected: PASS (all harness tests green) + +- [ ] **Step 6: Commit** + +```bash +git add src/agentex/lib/core/harness/emitter.py src/agentex/lib/core/harness/__init__.py tests/lib/core/harness/test_emitter.py +git commit -m "feat(harness): UnifiedEmitter facade tying delivery + tracing + usage" +``` + +--- + +## Task 7: Conformance test scaffold + empty CI integration job — PR 3 (part 3) + +**Files:** +- Create: `tests/lib/core/harness/conformance/__init__.py` +- Create: `tests/lib/core/harness/conformance/runner.py` +- Create: `tests/lib/core/harness/conformance/test_conformance.py` +- Create: `.github/workflows/harness-integration.yml` + +The conformance runner is the shared parametrized engine each harness tap will register fixtures +with (in later plans). It asserts yield-vs-auto-send equivalence on the span signals derived +from a fixture's canonical-event sequence. + +- [ ] **Step 1: Write the conformance runner + a self-test fixture** + +Create `tests/lib/core/harness/conformance/__init__.py` (empty), then +`tests/lib/core/harness/conformance/runner.py`: + +```python +"""Shared conformance engine: every harness tap registers fixtures here. + +A fixture is (name, list[StreamTaskMessage]). The runner asserts that span +derivation over the events is identical regardless of delivery channel, which is +the cross-channel guarantee from the spec. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage + + +@dataclass +class Fixture: + name: str + events: list[StreamTaskMessage] + + +_REGISTRY: list[Fixture] = [] + + +def register(fixture: Fixture) -> None: + _REGISTRY.append(fixture) + + +def all_fixtures() -> list[Fixture]: + return list(_REGISTRY) + + +def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]: + d = SpanDeriver() + out: list[SpanSignal] = [] + for e in events: + out.extend(d.observe(e)) + out.extend(d.flush()) + return out +``` + +- [ ] **Step 2: Write the conformance test (self-test on a built-in fixture)** + +Create `tests/lib/core/harness/conformance/test_conformance.py`: + +```python +import pytest + +from tests.lib.core.harness.conformance.runner import Fixture, derive_all, register, all_fixtures +from agentex.types.task_message_update import ( + StreamTaskMessageStart, StreamTaskMessageDone, StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + +register(Fixture( + name="builtin-single-tool", + events=[ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="c", name="Bash", content="ok")), + ], +)) + + +@pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) +def test_span_derivation_is_deterministic(fixture): + # Deriving twice over the same events yields identical signals (the property + # that makes yield vs auto-send equivalent, since both observe the same stream). 
+ assert derive_all(fixture.events) == derive_all(fixture.events) +``` + +- [ ] **Step 3: Run the conformance test** + +Run: `pytest tests/lib/core/harness/conformance/ -v` +Expected: PASS (1 passed) + +- [ ] **Step 4: Add the empty CI integration job** + +Create `.github/workflows/harness-integration.yml` (mirrors the structure of the existing +`agentex-tutorials-test.yml`; the matrix is populated in later plans): + +```yaml +name: Harness Integration + +on: + pull_request: + paths: + - "src/agentex/lib/core/harness/**" + - "src/agentex/lib/adk/_modules/**" + - ".github/workflows/harness-integration.yml" + +jobs: + conformance: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - name: Install + run: uv sync + - name: Conformance suite + run: uv run pytest tests/lib/core/harness/ -v + + # Live integration matrix (harness x {sync, async, temporal}) is added per-harness + # in the migration plans. Placeholder job keeps the workflow valid until then. + live-matrix: + runs-on: ubuntu-latest + if: false # enabled once the first harness's test agents land + steps: + - run: echo "populated by migration PRs" +``` + +- [ ] **Step 5: Commit** + +```bash +git add tests/lib/core/harness/conformance .github/workflows/harness-integration.yml +git commit -m "test(harness): conformance scaffold + CI integration job skeleton" +``` + +--- + +## Task 8: Run the full suite + type check + +- [ ] **Step 1: Run the whole harness test tree** + +Run: `pytest tests/lib/core/harness/ -v` +Expected: PASS (all tasks' tests green) + +- [ ] **Step 2: Type check the new package** + +Run: `uv run mypy src/agentex/lib/core/harness/` (or the repo's configured type checker) +Expected: no errors. Fix any signature mismatches inline. 
+ +- [ ] **Step 3: Final commit if the type check required fixes** + +```bash +git add -A && git commit -m "chore(harness): type-check fixes for foundation package" +``` + +--- + +## Subsequent plans (to be written after this lands) + +Each gets its own plan via the writing-plans skill, expanded with that harness's exact +converter code: + +- **PR 4 — Migrate pydantic-ai:** wrap `convert_pydantic_ai_to_agentex_events` as a + `HarnessTurn` (add `usage()` normalizing `result.usage()`), reimplement `_pydantic_ai_async` + on `auto_send`, retire `_pydantic_ai_tracing` in favor of `SpanTracer`, keep the public + `convert_*` signature. Add 3 test agents (sync/async/temporal) + register conformance + fixtures + enable the live-matrix row. +- **PR 5 — Migrate langgraph:** same shape; reimplement `stream_langgraph_events` on + `auto_send`; normalize `usage_metadata` into `TurnUsage`. +- **PR 6 — Migrate openai-agents:** same shape; reimplement `run_agent_streamed_auto_send` on + `auto_send`; normalize `response.usage`. +- **PR 7 — claude-code parser tap:** `convert_claude_code_to_agentex_events` (port the golden + agent's `_StreamJsonProcessor` to yield `StreamTaskMessage*`) + recorded stream-json + fixtures + feasible test agent(s). +- **PR 8 — codex parser tap:** same shape for `_CodexEventProcessor`. +- **PR 9 — Cleanup:** delete now-dead internal duplication, deprecate `_*_tracing` shims, docs. + +The `is_error` tool-error work is deferred and tracked in Linear as AGX1-371. 
From de564833e8364492bc7310880d9557643fa98883 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:00:40 -0400 Subject: [PATCH 05/35] feat(harness): foundation types for unified harness surface Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/__init__.py | 6 ++ src/agentex/lib/core/harness/types.py | 91 ++++++++++++++++++++++++ tests/lib/core/harness/__init__.py | 0 tests/lib/core/harness/test_types.py | 27 +++++++ 4 files changed, 124 insertions(+) create mode 100644 src/agentex/lib/core/harness/__init__.py create mode 100644 src/agentex/lib/core/harness/types.py create mode 100644 tests/lib/core/harness/__init__.py create mode 100644 tests/lib/core/harness/test_types.py diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py new file mode 100644 index 000000000..15d116148 --- /dev/null +++ b/src/agentex/lib/core/harness/__init__.py @@ -0,0 +1,6 @@ +"""Shared, harness-independent machinery for the unified harness surface. + +The Agentex StreamTaskMessage* stream is the single source of truth; this +package derives spans from it and delivers it (yield or auto-send), so every +harness tap gets streaming + tracing + turn usage uniformly. +""" diff --git a/src/agentex/lib/core/harness/types.py b/src/agentex/lib/core/harness/types.py new file mode 100644 index 000000000..f31b2c67f --- /dev/null +++ b/src/agentex/lib/core/harness/types.py @@ -0,0 +1,91 @@ +"""Types for the unified harness surface.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, AsyncIterator, Literal, Protocol, Union, runtime_checkable + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from pydantic import BaseModel, ConfigDict + +# The canonical stream element. Taps yield these; delivery adapters consume them. 
+StreamTaskMessage = Union[ + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +] + +SpanKind = Literal["tool", "reasoning", "subagent"] + + +@dataclass +class OpenSpan: + """Signal to open a child span. `key` pairs an open with its close.""" + + key: str + kind: SpanKind + name: str + input: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CloseSpan: + """Signal to close the span previously opened with the same `key`.""" + + key: str + output: Any = None + is_complete: bool = True # False when closed by flush() without a result + + +SpanSignal = Union[OpenSpan, CloseSpan] + + +class TurnUsage(BaseModel): + """Harness-independent turn usage/cost, attached to the turn span. + + Token field names align with agentex.lib.core.observability.llm_metrics. + """ + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + num_llm_calls: int = 0 + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 + + +class TurnResult(BaseModel): + """Returned to the caller after a turn is delivered.""" + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + final_text: str = "" + usage: TurnUsage = TurnUsage() + + +@runtime_checkable +class HarnessTurn(Protocol): + """A single harness turn: a canonical stream plus its normalized usage. + + Python async generators cannot cleanly return a value to their consumer, so + a tap exposes usage via `usage()` (valid only after `events` is exhausted) + rather than via StopAsyncIteration. + """ + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... 
diff --git a/tests/lib/core/harness/__init__.py b/tests/lib/core/harness/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py new file mode 100644 index 000000000..b025d803b --- /dev/null +++ b/tests/lib/core/harness/test_types.py @@ -0,0 +1,27 @@ +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + TurnResult, +) + + +def test_open_close_span_construct(): + o = OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}) + c = CloseSpan(key="call_1", output="files", is_complete=True) + assert o.key == c.key == "call_1" + assert o.kind == "tool" + assert c.is_complete is True + + +def test_turn_usage_defaults_are_none(): + u = TurnUsage(model="claude-opus-4-6") + assert u.model == "claude-opus-4-6" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +def test_turn_result_wraps_usage(): + r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) + assert r.final_text == "hi" + assert r.usage.model == "m" From a13725cca825b8a0f21b7a1b25c25bf223447b27 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:05:15 -0400 Subject: [PATCH 06/35] test(harness): cover CloseSpan defaults and HarnessTurn runtime check Co-Authored-By: Claude Sonnet 4.6 --- tests/lib/core/harness/test_types.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py index b025d803b..91857993a 100644 --- a/tests/lib/core/harness/test_types.py +++ b/tests/lib/core/harness/test_types.py @@ -1,6 +1,10 @@ +from typing import AsyncIterator + from agentex.lib.core.harness.types import ( OpenSpan, CloseSpan, + HarnessTurn, + StreamTaskMessage, TurnUsage, TurnResult, ) @@ -25,3 +29,25 @@ def test_turn_result_wraps_usage(): r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) assert r.final_text == "hi" assert r.usage.model == "m" + + 
+def test_close_span_defaults(): + c = CloseSpan(key="x") + assert c.output is None + assert c.is_complete is True + + +def test_harness_turn_runtime_check(): + class _Turn: + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + async def _gen() -> AsyncIterator[StreamTaskMessage]: + if False: + yield # pragma: no cover + + return _gen() + + def usage(self) -> TurnUsage: + return TurnUsage(model="m") + + assert isinstance(_Turn(), HarnessTurn) is True From 13868ee1968632f47eb2679b234c908239896f84 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:07:47 -0400 Subject: [PATCH 07/35] feat(harness): pure SpanDeriver reducing the canonical stream to span signals Co-Authored-By: Claude Sonnet 4.6 --- .../lib/core/harness/span_derivation.py | 115 +++++++++++++++++ .../lib/core/harness/test_span_derivation.py | 120 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 src/agentex/lib/core/harness/span_derivation.py create mode 100644 tests/lib/core/harness/test_span_derivation.py diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py new file mode 100644 index 000000000..deb5d6d68 --- /dev/null +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -0,0 +1,115 @@ +"""Pure reducer: canonical StreamTaskMessage* stream -> span open/close signals. + +Has no dependency on adk; unit-testable in isolation. Delivery adapters feed it +every event and act on the returned signals. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from typing import Any + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) + +from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal, StreamTaskMessage + + +@dataclass +class _ToolReqMeta: + tool_call_id: str + name: str + arguments: dict[str, Any] + args_buf: str = "" # accumulated streamed argument fragments + + +class SpanDeriver: + """Stateful reducer over the canonical stream. + + Tool span: open on Done of a ToolRequestContent index; close on matching + ToolResponseContent by tool_call_id. Reasoning span: open on + Start(ReasoningContent); close on that index's Done. + """ + + def __init__(self) -> None: + self._tool_by_index: dict[int, _ToolReqMeta] = {} + self._reasoning_index_open: set[int] = set() + self._open_tool_ids: set[str] = set() + + def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: + if isinstance(event, StreamTaskMessageStart): + return self._on_start(event) + if isinstance(event, StreamTaskMessageDelta): + return self._on_delta(event) + if isinstance(event, StreamTaskMessageFull): + return self._on_full(event) + if isinstance(event, StreamTaskMessageDone): + return self._on_done(event) + return [] + + def flush(self) -> list[SpanSignal]: + """Close anything still open at end of stream, marked incomplete.""" + signals: list[SpanSignal] = [] + for tcid in list(self._open_tool_ids): + signals.append(CloseSpan(key=tcid, output=None, is_complete=False)) + self._open_tool_ids.clear() + for idx in sorted(self._reasoning_index_open): + signals.append(CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=False)) + self._reasoning_index_open.clear() + return signals + + def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + content = event.content + idx = event.index if event.index is not None else -1 + 
ctype = getattr(content, "type", None) + if ctype == "tool_request": + self._tool_by_index[idx] = _ToolReqMeta( + tool_call_id=content.tool_call_id, + name=content.name, + arguments=dict(content.arguments or {}), + ) + return [] + if ctype == "reasoning": + self._reasoning_index_open.add(idx) + return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] + return [] + + def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: + idx = event.index if event.index is not None else -1 + delta = event.delta + if delta is not None and getattr(delta, "type", None) == "tool_request": + meta = self._tool_by_index.get(idx) + if meta is not None and delta.arguments_delta: + meta.args_buf += delta.arguments_delta + return [] + + def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + content = event.content + if getattr(content, "type", None) == "tool_response": + tcid = content.tool_call_id + if tcid in self._open_tool_ids: + self._open_tool_ids.discard(tcid) + return [CloseSpan(key=tcid, output=content.content, is_complete=True)] + return [] + + def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: + idx = event.index if event.index is not None else -1 + meta = self._tool_by_index.pop(idx, None) + if meta is not None: + args = meta.arguments + if meta.args_buf: + try: + args = json.loads(meta.args_buf) + except json.JSONDecodeError: + args = {"_raw": meta.args_buf} + self._open_tool_ids.add(meta.tool_call_id) + return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] + if idx in self._reasoning_index_open: + self._reasoning_index_open.discard(idx) + return [CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=True)] + return [] diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py new file mode 100644 index 000000000..0b1a4bcbe --- /dev/null +++ b/tests/lib/core/harness/test_span_derivation.py @@ -0,0 +1,120 @@ +from 
agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +) +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.tool_request_delta import ToolRequestDelta + + +def _signals(deriver, events): + out = [] + for e in events: + out.extend(deriver.observe(e)) + out.extend(deriver.flush()) + return out + + +def _tool_req(idx, tcid, name, args): + return StreamTaskMessageStart( + type="start", index=idx, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id=tcid, name=name, arguments=args), + ) + + +def test_text_only_yields_no_spans(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, + delta=None), + StreamTaskMessageDone(type="done", index=0), + ] + assert _signals(d, events) == [] + + +def test_single_tool_opens_on_done_closes_on_response(): + d = SpanDeriver() + events = [ + _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="call_1", name="Bash", content="files")), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}), + CloseSpan(key="call_1", output="files", is_complete=True), + ] + + +def test_reasoning_opens_on_start_closes_on_done(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + 
content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[])), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="reasoning:0", kind="reasoning", name="reasoning", input={}) + assert sigs[1] == CloseSpan(key="reasoning:0", output=None, is_complete=True) + + +def test_parallel_tools_pair_by_tool_call_id(): + d = SpanDeriver() + events = [ + _tool_req(0, "a", "T1", {}), + _tool_req(1, "b", "T2", {}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + StreamTaskMessageFull(type="full", index=2, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="b", name="T2", content="rb")), + StreamTaskMessageFull(type="full", index=3, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="a", name="T1", content="ra")), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + closes = [s for s in sigs if isinstance(s, CloseSpan)] + assert {o.key for o in opens} == {"a", "b"} + assert [c.key for c in closes] == ["b", "a"] + assert all(c.is_complete for c in closes) + + +def test_streamed_args_accumulate_into_open_input(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDelta(type="delta", index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", + arguments_delta='{"cmd":')), + StreamTaskMessageDelta(type="delta", index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", + arguments_delta='"ls"}')), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="c", kind="tool", name="Bash", input={"cmd": "ls"}) + + +def test_unclosed_tool_closed_incomplete_on_flush(): + d = SpanDeriver() + 
events = [ + _tool_req(0, "x", "Bash", {}), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) + assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) From 0ecc03f4b4f8e686c73c0d7f2acc2c29c91a6f4c Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:13:13 -0400 Subject: [PATCH 08/35] refactor(harness): deterministic flush order + defensive index/orphan handling in SpanDeriver Co-Authored-By: Claude Sonnet 4.6 --- .../lib/core/harness/span_derivation.py | 33 ++++++++++++++----- .../lib/core/harness/test_span_derivation.py | 21 ++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py index deb5d6d68..15e1f593f 100644 --- a/src/agentex/lib/core/harness/span_derivation.py +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -8,7 +8,6 @@ import json from dataclasses import dataclass -from typing import Any from agentex.types.task_message_update import ( StreamTaskMessageDelta, @@ -24,7 +23,7 @@ class _ToolReqMeta: tool_call_id: str name: str - arguments: dict[str, Any] + arguments: dict[str, object] args_buf: str = "" # accumulated streamed argument fragments @@ -34,12 +33,24 @@ class SpanDeriver: Tool span: open on Done of a ToolRequestContent index; close on matching ToolResponseContent by tool_call_id. Reasoning span: open on Start(ReasoningContent); close on that index's Done. + + Deliberate contracts: + - A `Full(ToolResponseContent)` whose tool_call_id was never opened is + ignored (no CloseSpan emitted). + - A `Done` for an index that was never a tool_request/reasoning Start is + ignored (no signal emitted). + - Events with `index is None` are skipped entirely; without a stable index + they cannot be reliably paired, and aliasing them to a sentinel would + let unrelated None-indexed events cross-match. 
+ - `flush()` closes anything still open as incomplete; unclosed tool spans + are emitted in the order they were opened. """ def __init__(self) -> None: self._tool_by_index: dict[int, _ToolReqMeta] = {} self._reasoning_index_open: set[int] = set() - self._open_tool_ids: set[str] = set() + # insertion-ordered set of open tool_call_ids (dict keys preserve order) + self._open_tool_ids: dict[str, None] = {} def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: if isinstance(event, StreamTaskMessageStart): @@ -64,8 +75,10 @@ def flush(self) -> list[SpanSignal]: return signals def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index content = event.content - idx = event.index if event.index is not None else -1 ctype = getattr(content, "type", None) if ctype == "tool_request": self._tool_by_index[idx] = _ToolReqMeta( @@ -80,7 +93,9 @@ def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: return [] def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: - idx = event.index if event.index is not None else -1 + if event.index is None: + return [] + idx = event.index delta = event.delta if delta is not None and getattr(delta, "type", None) == "tool_request": meta = self._tool_by_index.get(idx) @@ -93,12 +108,14 @@ def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: if getattr(content, "type", None) == "tool_response": tcid = content.tool_call_id if tcid in self._open_tool_ids: - self._open_tool_ids.discard(tcid) + self._open_tool_ids.pop(tcid, None) return [CloseSpan(key=tcid, output=content.content, is_complete=True)] return [] def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: - idx = event.index if event.index is not None else -1 + if event.index is None: + return [] + idx = event.index meta = self._tool_by_index.pop(idx, None) if meta is not None: args = meta.arguments @@ -107,7 +124,7 @@ def _on_done(self, event: 
StreamTaskMessageDone) -> list[SpanSignal]: args = json.loads(meta.args_buf) except json.JSONDecodeError: args = {"_raw": meta.args_buf} - self._open_tool_ids.add(meta.tool_call_id) + self._open_tool_ids[meta.tool_call_id] = None return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] if idx in self._reasoning_index_open: self._reasoning_index_open.discard(idx) diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py index 0b1a4bcbe..0630131d0 100644 --- a/tests/lib/core/harness/test_span_derivation.py +++ b/tests/lib/core/harness/test_span_derivation.py @@ -118,3 +118,24 @@ def test_unclosed_tool_closed_incomplete_on_flush(): sigs = _signals(d, events) assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) + + +def test_none_index_is_skipped(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=None, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="n", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=None), + ] + assert _signals(d, events) == [] + + +def test_orphan_tool_response_ignored(): + d = SpanDeriver() + events = [ + StreamTaskMessageFull(type="full", index=0, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="z", name="Bash", content="r")), + ] + assert _signals(d, events) == [] From 8d708f41fccf0b5b6a046999ac67c2106666b5aa Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:16:02 -0400 Subject: [PATCH 09/35] feat(harness): SpanTracer adapter from span signals to adk.tracing Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/tracer.py | 68 ++++++++++++++++++++++++++ tests/lib/core/harness/test_tracer.py | 54 ++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 src/agentex/lib/core/harness/tracer.py create mode 100644 
tests/lib/core/harness/test_tracer.py diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py new file mode 100644 index 000000000..55fab4029 --- /dev/null +++ b/src/agentex/lib/core/harness/tracer.py @@ -0,0 +1,68 @@ +"""Adapter from SpanSignals to adk.tracing spans (best-effort, overridable).""" + +from __future__ import annotations + +import logging +from typing import Any + +from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal + +logger = logging.getLogger(__name__) + + +class SpanTracer: + """Opens/closes adk.tracing child spans in response to span signals. + + `tracing` defaults to the real `adk.tracing` module; inject a fake in tests + or a custom tracer to override. No-op when `trace_id` is falsy. Never raises. + + The real TracingModule.end_span does NOT accept an output kwarg — output is + recorded by mutating span.output before calling end_span, matching the pattern + used throughout the codebase (see _langgraph_tracing.py on_tool_end etc.). + """ + + def __init__( + self, + trace_id: str | None, + parent_span_id: str | None, + tracing: Any = None, + task_id: str | None = None, + ): + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self.task_id = task_id + if tracing is None: + from agentex.lib import adk + + tracing = adk.tracing + self._tracing = tracing + self._open: dict[str, Any] = {} # span key -> span object + + async def handle(self, signal: SpanSignal) -> None: + if not self.trace_id: + return + try: + if isinstance(signal, OpenSpan): + span = await self._tracing.start_span( + trace_id=self.trace_id, + name=signal.name, + input=signal.input, + parent_id=self.parent_span_id, + task_id=self.task_id, + ) + if span is not None: + self._open[signal.key] = span + elif isinstance(signal, CloseSpan): + span = self._open.pop(signal.key, None) + if span is not None: + # Output is recorded by mutating span.output before end_span. 
+ # The real TracingModule.end_span signature is: + # end_span(trace_id, span, start_to_close_timeout, heartbeat_timeout, retry_policy) + # It does not accept an output= kwarg. + span.output = signal.output + await self._tracing.end_span( + trace_id=self.trace_id, + span=span, + ) + except Exception as exc: # best-effort: tracing never breaks delivery + logger.warning("[harness.tracer] span signal failed: %s", exc) diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py new file mode 100644 index 000000000..105995bc8 --- /dev/null +++ b/tests/lib/core/harness/test_tracer.py @@ -0,0 +1,54 @@ +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import OpenSpan, CloseSpan + + +class _FakeSpan: + def __init__(self, name): + self.name = name + self.output = None + + +class _FakeTracing: + def __init__(self): + self.started = [] + self.ended = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span): + self.ended.append((span.name, span.output)) + + +@pytest.mark.asyncio +async def test_open_then_close_starts_and_ends_span(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.started == [("Bash", "p1", {"cmd": "ls"})] + assert fake.ended == [("Bash", "files")] + + +@pytest.mark.asyncio +async def test_no_trace_id_is_noop(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="", parent_span_id=None, tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert fake.started == [] and fake.ended == [] + + 
+@pytest.mark.asyncio +async def test_tracing_failure_is_swallowed(): + class _Boom(_FakeTracing): + async def start_span(self, **kw): + raise RuntimeError("backend down") + + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=_Boom()) + # Must not raise. + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) From 7955d55b1d88b39557e129f0539c4cf42565ddc4 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:21:07 -0400 Subject: [PATCH 10/35] refactor(harness): guarded make_logger import + lifecycle contract tests for SpanTracer Co-Authored-By: Claude Opus 4.8 (1M context) --- src/agentex/lib/core/harness/tracer.py | 18 ++++++++++++++++-- tests/lib/core/harness/test_tracer.py | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py index 55fab4029..3f4ff40c2 100644 --- a/src/agentex/lib/core/harness/tracer.py +++ b/src/agentex/lib/core/harness/tracer.py @@ -2,12 +2,18 @@ from __future__ import annotations -import logging from typing import Any from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal -logger = logging.getLogger(__name__) +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) class SpanTracer: @@ -19,6 +25,14 @@ class SpanTracer: The real TracingModule.end_span does NOT accept an output kwarg — output is recorded by mutating span.output before calling end_span, matching the pattern used throughout the codebase (see _langgraph_tracing.py on_tool_end etc.). + + Span-lifecycle contract: the `_open` dict (span key -> span object) is scoped + to a single turn. 
Pairing is by `key`: + - A duplicate OpenSpan for a key already in `_open` silently replaces the + earlier span; the earlier span is then orphaned (never closed / leaked). + - A CloseSpan for an unknown key is a no-op. + - Unpaired opens accumulate in `_open` for the lifetime of the tracer; since + a tracer is expected to live for one turn, this is bounded and acceptable. """ def __init__( diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py index 105995bc8..f5fdb16b6 100644 --- a/tests/lib/core/harness/test_tracer.py +++ b/tests/lib/core/harness/test_tracer.py @@ -52,3 +52,16 @@ async def start_span(self, **kw): # Must not raise. await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) await tracer.handle(CloseSpan(key="k")) + assert tracer._open == {} + + +@pytest.mark.asyncio +async def test_duplicate_open_replaces_silently(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="A")) + await tracer.handle(OpenSpan(key="k", kind="tool", name="B")) + await tracer.handle(CloseSpan(key="k")) + # Both opens started spans, but only the second ("B") is closed. 
+ assert [name for name, _, _ in fake.started] == ["A", "B"] + assert fake.ended == [("B", None)] From 803191ba9ce4d2db5c25723af1c4a8934cb0220e Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:23:02 -0400 Subject: [PATCH 11/35] feat(harness): yield_events delivery adapter (passthrough + tracing) Co-Authored-By: Claude Sonnet 4.6 --- .../lib/core/harness/yield_delivery.py | 31 ++++++++++ tests/lib/core/harness/test_yield_delivery.py | 58 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 src/agentex/lib/core/harness/yield_delivery.py create mode 100644 tests/lib/core/harness/test_yield_delivery.py diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py new file mode 100644 index 000000000..0d04647da --- /dev/null +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -0,0 +1,31 @@ +"""Yield delivery: pass the canonical stream through, tracing as a side effect.""" + +from __future__ import annotations + +from typing import AsyncIterator + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import StreamTaskMessage + + +async def yield_events( + events: AsyncIterator[StreamTaskMessage], + tracer: SpanTracer | None = None, +) -> AsyncIterator[StreamTaskMessage]: + """Forward each event to the caller; derive + trace spans as a side effect. + + For sync HTTP ACP agents that yield events back over the response. When + `tracer` is None, this is a pure passthrough. 
+ """ + deriver = SpanDeriver() if tracer is not None else None + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + yield event + finally: + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py new file mode 100644 index 000000000..46f0aeac1 --- /dev/null +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -0,0 +1,58 @@ +import types as _types + +import pytest + +from agentex.lib.core.harness.yield_delivery import yield_events +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDone, + StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() # supports arbitrary attribute assignment (span.output = ...) 
+ + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_passes_events_through_and_traces(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="c", name="Bash", content="ok")), + ] + out = [e async for e in yield_events(_gen(events), tracer=tracer)] + assert out == events # passthrough unchanged + assert fake.started == ["Bash"] # span derived + opened + assert fake.ended == ["ok"] # span closed with response + + +@pytest.mark.asyncio +async def test_yield_without_tracer_is_pure_passthrough(): + events = [ + StreamTaskMessageDone(type="done", index=0), + ] + out = [e async for e in yield_events(_gen(events), tracer=None)] + assert out == events From dab044f862e7a734c7c248f747ebe4470cbd9a82 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:27:54 -0400 Subject: [PATCH 12/35] refactor(harness): simplify yield_events guard + cover finally-flush on early close Co-Authored-By: Claude Sonnet 4.6 --- .../lib/core/harness/yield_delivery.py | 8 ++++---- tests/lib/core/harness/test_yield_delivery.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py index 0d04647da..ca923c6a3 100644 --- a/src/agentex/lib/core/harness/yield_delivery.py +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import AsyncIterator +from typing import 
AsyncGenerator, AsyncIterator from agentex.lib.core.harness.span_derivation import SpanDeriver from agentex.lib.core.harness.tracer import SpanTracer @@ -12,7 +12,7 @@ async def yield_events( events: AsyncIterator[StreamTaskMessage], tracer: SpanTracer | None = None, -) -> AsyncIterator[StreamTaskMessage]: +) -> AsyncGenerator[StreamTaskMessage, None]: """Forward each event to the caller; derive + trace spans as a side effect. For sync HTTP ACP agents that yield events back over the response. When @@ -21,11 +21,11 @@ async def yield_events( deriver = SpanDeriver() if tracer is not None else None try: async for event in events: - if deriver is not None and tracer is not None: + if deriver is not None: # tracer is non-None whenever deriver is set for signal in deriver.observe(event): await tracer.handle(signal) yield event finally: - if deriver is not None and tracer is not None: + if deriver is not None: # tracer is non-None whenever deriver is set for signal in deriver.flush(): await tracer.handle(signal) diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py index 46f0aeac1..986b4a92d 100644 --- a/tests/lib/core/harness/test_yield_delivery.py +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -56,3 +56,22 @@ async def test_yield_without_tracer_is_pure_passthrough(): ] out = [e async for e in yield_events(_gen(events), tracer=None)] assert out == events + + +@pytest.mark.asyncio +async def test_flush_runs_on_early_close(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=0), + # response intentionally never arrives + ] + gen = yield_events(_gen(events), tracer=tracer) + first = await gen.__anext__() # Start + second = await gen.__anext__() # Done -> 
tool span opens here + await gen.aclose() # triggers the finally -> flush() + assert fake.started == ["Bash"] + assert fake.ended == [None] # flush closed the unpaired span (incomplete, no output) From 3cc0326bef21caaea138234302fb411d494f59a6 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:32:36 -0400 Subject: [PATCH 13/35] feat(harness): auto_send delivery adapter (canonical stream -> adk.streaming + tracing) Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/auto_send.py | 118 ++++++++++ tests/lib/core/harness/test_auto_send.py | 248 ++++++++++++++++++++++ 2 files changed, 366 insertions(+) create mode 100644 src/agentex/lib/core/harness/auto_send.py create mode 100644 tests/lib/core/harness/test_auto_send.py diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py new file mode 100644 index 000000000..506a9ad82 --- /dev/null +++ b/src/agentex/lib/core/harness/auto_send.py @@ -0,0 +1,118 @@ +"""Auto-send delivery: canonical stream -> adk.streaming side effects + tracing.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import StreamTaskMessage, TurnResult, TurnUsage + + +async def auto_send( + events: AsyncIterator[StreamTaskMessage], + task_id: str, + tracer: SpanTracer | None = None, + streaming: Any = None, + usage: TurnUsage | None = None, +) -> TurnResult: + """Push the canonical stream to the task stream via adk.streaming. + + Opens a streaming context per text/reasoning message, streams deltas via + ctx.stream_update, and closes via ctx.close() on Done. 
Posts tool + request/response full messages by opening a context with the content and + closing it immediately (no deltas). Derives and traces spans from the same + stream. Returns the accumulated final text + usage. + + Mirrors the open/close/stream_update pattern from + src/agentex/lib/adk/_modules/_langgraph_async.py: + - context opened via streaming_task_message_context(...).__aenter__() + - context closed via ctx.close() (not __aexit__) + - deltas pushed as StreamTaskMessageDelta with parent_task_message set + from ctx.task_message + + For async + temporal agents (call from inside an activity). + """ + if streaming is None: + from agentex.lib import adk + + streaming = adk.streaming + + deriver = SpanDeriver() if tracer is not None else None + final_text_parts: list[str] = [] + current_ctx: Any = None + current_kind: str | None = None # "text" | "reasoning" + + async def _close_current() -> None: + nonlocal current_ctx, current_kind + if current_ctx is not None: + await current_ctx.close() + current_ctx = None + current_kind = None + + try: + async for event in events: + if deriver is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) # type: ignore[union-attr] + + if isinstance(event, StreamTaskMessageStart): + ctype = getattr(event.content, "type", None) + if ctype in ("text", "reasoning"): + await _close_current() + ctx = streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + ) + current_ctx = await ctx.__aenter__() + current_kind = ctype + + elif isinstance(event, StreamTaskMessageDelta): + if current_ctx is not None and event.delta is not None: + # Reconstruct the delta with parent_task_message set from + # the context's task_message (mirrors _langgraph_async.py + # lines 72-78 and 117-127). 
+ delta_with_parent = StreamTaskMessageDelta( + parent_task_message=current_ctx.task_message, + delta=event.delta, + type="delta", + index=event.index, + ) + await current_ctx.stream_update(delta_with_parent) + if ( + getattr(event.delta, "type", None) == "text" + and event.delta.text_delta + ): + final_text_parts.append(event.delta.text_delta) + + elif isinstance(event, StreamTaskMessageDone): + await _close_current() + + elif isinstance(event, StreamTaskMessageFull): + # Full messages (tool_request / tool_response): close any open + # streaming context first, then post the full message by opening + # a context with the content and closing it immediately + # (no deltas; StreamingTaskMessageContext.close() persists + # initial_content when the accumulator is empty). + await _close_current() + ctx = streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + ) + full_ctx = await ctx.__aenter__() + await full_ctx.close() + + finally: + await _close_current() + if deriver is not None: + for signal in deriver.flush(): + await tracer.handle(signal) # type: ignore[union-attr] + + return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage()) diff --git a/tests/lib/core/harness/test_auto_send.py b/tests/lib/core/harness/test_auto_send.py new file mode 100644 index 000000000..2a83658e1 --- /dev/null +++ b/tests/lib/core/harness/test_auto_send.py @@ -0,0 +1,248 @@ +"""Tests for auto_send delivery adapter. + +The fake mirrors the real StreamingTaskMessageContext API exactly: +- streaming_task_message_context(...) returns a context object (synchronously) +- open the context via __aenter__ (returns self after creating the task message) +- stream deltas via ctx.stream_update(StreamTaskMessageDelta(...)) +- close via ctx.close() (NOT __aexit__) + +This mirrors _langgraph_async.py lines 62-78 and 100-127. 
+""" + +import types as _types + +import pytest + +from agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message import TaskMessage +from agentex.types.task_message_update import ( + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageFull, +) +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +class _FakeCtx: + """Mirrors StreamingTaskMessageContext: __aenter__ opens (returns self with task_message set), + close() closes. stream_update records the call. + + task_message is a real TaskMessage instance so that auto_send can use it + as parent_task_message in StreamTaskMessageDelta without Pydantic validation errors. + """ + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + # Real TaskMessage so StreamTaskMessageDelta(parent_task_message=...) 
passes validation + self.task_message = TaskMessage( + id="msg-1", task_id="task1", content=initial_content + ) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + # __aexit__ delegates to close in the real impl; keep for safety + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Mirrors StreamingService: streaming_task_message_context returns a context object.""" + + def __init__(self): + self.sink = [] + + def streaming_task_message_context( + self, task_id, initial_content, streaming_mode="coalesced", created_at=None + ): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +async def _gen(events): + for e in events: + yield e + + +# --------------------------------------------------------------------------- +# Test 1: text streaming — open, stream deltas, close; return accumulated text +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_auto_send_streams_text_and_returns_final_text(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", index=0, + delta=TextDelta(type="text", text_delta="Hel"), + ), + StreamTaskMessageDelta( + type="delta", index=0, + delta=TextDelta(type="text", text_delta="lo"), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "Hello" + + kinds = [s[0] for s in streaming.sink] + # A context was created for the text content + assert kinds[0] == 
"ctx" + # It was opened and closed + assert "open" in kinds + assert "close" in kinds + # Exactly two updates were streamed (one per delta) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + +# --------------------------------------------------------------------------- +# Test 2: tool_request Full + tool_response Full — each posts one full message +# (open context with the content, no deltas, close immediately) +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_auto_send_posts_full_tool_messages(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageFull( + type="full", index=0, + content=ToolRequestContent( + type="tool_request", author="agent", + tool_call_id="c1", name="Bash", arguments={"cmd": "ls"}, + ), + ), + StreamTaskMessageFull( + type="full", index=1, + content=ToolResponseContent( + type="tool_response", author="agent", + tool_call_id="c1", name="Bash", content="file.py", + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + # One context per Full event + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + content_types = [s[1] for s in ctx_events] + assert "tool_request" in content_types + assert "tool_response" in content_types + + # Each context is opened and closed + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 2 + assert len(closes) == 2 + + # No stream_update calls (full messages have no deltas) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 0 + + +# --------------------------------------------------------------------------- +# Test 3: tracing — spans are derived and handed to the tracer +# --------------------------------------------------------------------------- + +class 
_RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() + + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +@pytest.mark.asyncio +async def test_auto_send_derives_tool_spans_via_tracer(): + fake_tracing = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake_tracing) + streaming = _FakeStreaming() + + events = [ + StreamTaskMessageStart( + type="start", index=0, + content=ToolRequestContent( + type="tool_request", author="agent", + tool_call_id="c1", name="Bash", arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", index=1, + content=ToolResponseContent( + type="tool_response", author="agent", + tool_call_id="c1", name="Bash", content="ok", + ), + ), + ] + + result = await auto_send( + _gen(events), task_id="task1", tracer=tracer, streaming=streaming + ) + + assert result.final_text == "" + assert fake_tracing.started == ["Bash"] + assert fake_tracing.ended == ["ok"] + + +# --------------------------------------------------------------------------- +# Test 4: text followed by a tool Full — text context is closed before Full +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_auto_send_closes_text_context_before_full_message(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", index=0, + delta=TextDelta(type="text", text_delta="Hi"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", index=1, + content=ToolRequestContent( + type="tool_request", author="agent", + tool_call_id="c2", 
name="read_file", arguments={}, + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Hi" + + # Verify ordering: text ctx opens, updates, closes; then tool_request ctx opens, closes + event_sequence = [(s[0], s[1]) for s in streaming.sink] + text_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "text")) + text_close_idx = next(i for i, s in enumerate(event_sequence) if s == ("close", "text")) + tool_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "tool_request")) + assert text_open_idx < text_close_idx < tool_open_idx From 260064e2ec1c0a6d8fd290c90ab2d7359316c697 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:39:25 -0400 Subject: [PATCH 14/35] refactor(harness): exception-safe full-message post + drop dead state + cover error/finally paths in auto_send Co-Authored-By: Claude Opus 4.8 (1M context) --- src/agentex/lib/core/harness/auto_send.py | 16 ++++----- tests/lib/core/harness/test_auto_send.py | 44 ++++++++++++++++++++--- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py index 506a9ad82..e7de01a68 100644 --- a/src/agentex/lib/core/harness/auto_send.py +++ b/src/agentex/lib/core/harness/auto_send.py @@ -48,14 +48,12 @@ async def auto_send( deriver = SpanDeriver() if tracer is not None else None final_text_parts: list[str] = [] current_ctx: Any = None - current_kind: str | None = None # "text" | "reasoning" async def _close_current() -> None: - nonlocal current_ctx, current_kind + nonlocal current_ctx if current_ctx is not None: await current_ctx.close() current_ctx = None - current_kind = None try: async for event in events: @@ -72,7 +70,6 @@ async def _close_current() -> None: initial_content=event.content, ) current_ctx = await ctx.__aenter__() - current_kind = ctype elif isinstance(event, 
StreamTaskMessageDelta): if current_ctx is not None and event.delta is not None: @@ -100,14 +97,15 @@ async def _close_current() -> None: # streaming context first, then post the full message by opening # a context with the content and closing it immediately # (no deltas; StreamingTaskMessageContext.close() persists - # initial_content when the accumulator is empty). + # initial_content when the accumulator is empty). Use async with + # so the context is closed even if close() raises (__aexit__ + # delegates to close()). await _close_current() - ctx = streaming.streaming_task_message_context( + async with streaming.streaming_task_message_context( task_id=task_id, initial_content=event.content, - ) - full_ctx = await ctx.__aenter__() - await full_ctx.close() + ): + pass finally: await _close_current() diff --git a/tests/lib/core/harness/test_auto_send.py b/tests/lib/core/harness/test_auto_send.py index 2a83658e1..9568d7b87 100644 --- a/tests/lib/core/harness/test_auto_send.py +++ b/tests/lib/core/harness/test_auto_send.py @@ -126,15 +126,24 @@ async def test_auto_send_streams_text_and_returns_final_text(): async def test_auto_send_posts_full_tool_messages(): streaming = _FakeStreaming() events = [ + # A bare tool_request Start (no Done/Full) must NOT open a streaming + # context on its own — only Full events post messages. 
+ StreamTaskMessageStart( + type="start", index=0, + content=ToolRequestContent( + type="tool_request", author="agent", + tool_call_id="c0", name="Bash", arguments={}, + ), + ), StreamTaskMessageFull( - type="full", index=0, + type="full", index=1, content=ToolRequestContent( type="tool_request", author="agent", tool_call_id="c1", name="Bash", arguments={"cmd": "ls"}, ), ), StreamTaskMessageFull( - type="full", index=1, + type="full", index=2, content=ToolResponseContent( type="tool_response", author="agent", tool_call_id="c1", name="Bash", content="file.py", @@ -145,12 +154,12 @@ async def test_auto_send_posts_full_tool_messages(): assert result.final_text == "" - # One context per Full event + # The opened contexts correspond ONLY to the two Full events — the + # tool_request Start did not open a context. ctx_events = [s for s in streaming.sink if s[0] == "ctx"] assert len(ctx_events) == 2 content_types = [s[1] for s in ctx_events] - assert "tool_request" in content_types - assert "tool_response" in content_types + assert content_types == ["tool_request", "tool_response"] # Each context is opened and closed opens = [s for s in streaming.sink if s[0] == "open"] @@ -246,3 +255,28 @@ async def test_auto_send_closes_text_context_before_full_message(): text_close_idx = next(i for i, s in enumerate(event_sequence) if s == ("close", "text")) tool_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "tool_request")) assert text_open_idx < text_close_idx < tool_open_idx + + +# --------------------------------------------------------------------------- +# Test 5: midstream error — propagates AND the open context is closed (finally) +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_open_context_closed_on_midstream_error(): + streaming = _FakeStreaming() + + async def _exploding_gen(): + yield StreamTaskMessageStart( + type="start", index=0, + content=TextContent(type="text", 
author="agent", content=""), + ) + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + await auto_send( + _exploding_gen(), task_id="task1", tracer=None, streaming=streaming + ) + + # The text context that was opened mid-stream was closed by the finally block. + assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] + assert ("close", "text") in [(s[0], s[1]) for s in streaming.sink] From b27367b9f8fa1cd24de528eb6dffd29d633e2203 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:42:52 -0400 Subject: [PATCH 15/35] feat(harness): UnifiedEmitter facade tying delivery + tracing + usage Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/__init__.py | 24 ++++++++++ src/agentex/lib/core/harness/emitter.py | 59 ++++++++++++++++++++++++ tests/lib/core/harness/test_emitter.py | 56 ++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 src/agentex/lib/core/harness/emitter.py create mode 100644 tests/lib/core/harness/test_emitter.py diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py index 15d116148..2988db8ff 100644 --- a/src/agentex/lib/core/harness/__init__.py +++ b/src/agentex/lib/core/harness/__init__.py @@ -4,3 +4,27 @@ package derives spans from it and delivers it (yield or auto-send), so every harness tap gets streaming + tracing + turn usage uniformly. 
""" + +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import ( + CloseSpan, + HarnessTurn, + OpenSpan, + SpanSignal, + StreamTaskMessage, + TurnResult, + TurnUsage, +) + +__all__ = [ + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", +] diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py new file mode 100644 index 000000000..5944abc17 --- /dev/null +++ b/src/agentex/lib/core/harness/emitter.py @@ -0,0 +1,59 @@ +"""UnifiedEmitter: the single facade agent authors use for either delivery mode.""" + +from __future__ import annotations + +from typing import AsyncIterator + +from agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.types import HarnessTurn, StreamTaskMessage, TurnResult +from agentex.lib.core.harness.yield_delivery import yield_events + + +class UnifiedEmitter: + """Ties trace context + chosen delivery together. + + Tracing is default-on whenever `trace_id` is truthy; pass `tracer=False` to + disable, or a custom `SpanTracer` to override. 
+ """ + + tracer: SpanTracer | None + + def __init__( + self, + task_id: str, + trace_id: str | None, + parent_span_id: str | None, + tracer: SpanTracer | bool | None = None, + tracing: object | None = None, + ): + self.task_id = task_id + self.trace_id = trace_id + self.parent_span_id = parent_span_id + if tracer is False: + self.tracer = None + elif isinstance(tracer, SpanTracer): + self.tracer = tracer + elif trace_id: + self.tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id=task_id, + tracing=tracing, + ) + else: + self.tracer = None + + async def yield_turn(self, turn: HarnessTurn) -> AsyncIterator[StreamTaskMessage]: + """Sync HTTP ACP delivery: forward events, trace as side effect.""" + async for event in yield_events(turn.events, tracer=self.tracer): + yield event + + async def auto_send_turn(self, turn: HarnessTurn) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult.""" + return await auto_send( + turn.events, + task_id=self.task_id, + tracer=self.tracer, + usage=turn.usage(), + ) diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py new file mode 100644 index 000000000..318311e27 --- /dev/null +++ b/tests/lib/core/harness/test_emitter.py @@ -0,0 +1,56 @@ +import pytest + +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.core.harness.types import TurnUsage +from agentex.types.task_message_update import StreamTaskMessageStart, StreamTaskMessageDone +from agentex.types.text_content import TextContent + + +class _FakeTracing: + async def start_span(self, **kw): + return None + + async def end_span(self, **kw): + pass + + +class _Turn: + def __init__(self, events_list, usage): + self._events_list = events_list + self._usage = usage + + @property + async def events(self): + for e in self._events_list: + yield e + + def usage(self): + return self._usage + + +@pytest.mark.asyncio +async def 
test_emitter_yield_mode_passes_through(): + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, TurnUsage(model="m")) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +@pytest.mark.asyncio +async def test_emitter_tracing_default_on_when_trace_id_present(): + # Inject a fake tracing backend so the test env doesn't need temporalio. + # This exercises the default-on path (tracer=None) when trace_id is truthy. + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", + tracing=_FakeTracing()) + assert emitter.tracer is not None + + +@pytest.mark.asyncio +async def test_emitter_tracing_overridable_off(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) + assert emitter.tracer is None From ed86a460f94cf7a5b2705b1b13fbdf38ed929c22 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:47:44 -0400 Subject: [PATCH 16/35] refactor(harness): inject streaming into UnifiedEmitter + cover auto_send_turn + doc tracer modes Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/emitter.py | 16 +++++-- tests/lib/core/harness/test_emitter.py | 62 ++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py index 5944abc17..9573fb8b2 100644 --- a/src/agentex/lib/core/harness/emitter.py +++ b/src/agentex/lib/core/harness/emitter.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import AsyncIterator +from typing import AsyncGenerator from agentex.lib.core.harness.auto_send import auto_send from agentex.lib.core.harness.tracer import SpanTracer @@ -13,8 +13,13 @@ class UnifiedEmitter: """Ties trace context + chosen 
delivery together. - Tracing is default-on whenever `trace_id` is truthy; pass `tracer=False` to - disable, or a custom `SpanTracer` to override. + Tracing modes (the `tracer` arg): + - tracer=None (default): auto-construct a SpanTracer if `trace_id` is present. + - tracer=False: disable tracing entirely, regardless of `trace_id`. + - tracer=SpanTracer(...): use the supplied instance. + + `tracing` and `streaming` are injection escape-hatches for tests/advanced + use; leave them None in production so the real adk modules are used. """ tracer: SpanTracer | None @@ -26,10 +31,12 @@ def __init__( parent_span_id: str | None, tracer: SpanTracer | bool | None = None, tracing: object | None = None, + streaming: object | None = None, ): self.task_id = task_id self.trace_id = trace_id self.parent_span_id = parent_span_id + self._streaming = streaming if tracer is False: self.tracer = None elif isinstance(tracer, SpanTracer): @@ -44,7 +51,7 @@ def __init__( else: self.tracer = None - async def yield_turn(self, turn: HarnessTurn) -> AsyncIterator[StreamTaskMessage]: + async def yield_turn(self, turn: HarnessTurn) -> AsyncGenerator[StreamTaskMessage, None]: """Sync HTTP ACP delivery: forward events, trace as side effect.""" async for event in yield_events(turn.events, tracer=self.tracer): yield event @@ -55,5 +62,6 @@ async def auto_send_turn(self, turn: HarnessTurn) -> TurnResult: turn.events, task_id=self.task_id, tracer=self.tracer, + streaming=self._streaming, usage=turn.usage(), ) diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py index 318311e27..963a77dfe 100644 --- a/tests/lib/core/harness/test_emitter.py +++ b/tests/lib/core/harness/test_emitter.py @@ -2,7 +2,13 @@ from agentex.lib.core.harness.emitter import UnifiedEmitter from agentex.lib.core.harness.types import TurnUsage -from agentex.types.task_message_update import StreamTaskMessageStart, StreamTaskMessageDone +from agentex.types.task_message import TaskMessage +from
agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDelta, + StreamTaskMessageDone, + StreamTaskMessageStart, +) from agentex.types.text_content import TextContent @@ -14,6 +20,42 @@ async def end_span(self, **kw): pass +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake (see test_auto_send.py).""" + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink = [] + + def streaming_task_message_context( + self, task_id, initial_content, streaming_mode="coalesced", created_at=None + ): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + class _Turn: def __init__(self, events_list, usage): self._events_list = events_list @@ -54,3 +96,21 @@ async def test_emitter_tracing_default_on_when_trace_id_present(): async def test_emitter_tracing_overridable_off(): emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) assert emitter.tracer is None + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_returns_usage(): + usage = TurnUsage(model="m", input_tokens=5) + events = [ + StreamTaskMessageStart(type="start", index=0, + content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, + delta=TextDelta(type="text", text_delta="Hello")), + StreamTaskMessageDone(type="done", index=0), + ] + 
turn = _Turn(events, usage) + fake = _FakeStreaming() + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=fake) + result = await emitter.auto_send_turn(turn) + assert result.usage == usage + assert result.final_text == "Hello" From b5f6b94b6c57478e7de278e0d3ec1f34a750e72e Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:50:18 -0400 Subject: [PATCH 17/35] test(harness): conformance scaffold + CI integration job skeleton Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/harness-integration.yml | 33 ++++++++++++++++ .../lib/core/harness/conformance/__init__.py | 0 tests/lib/core/harness/conformance/runner.py | 39 +++++++++++++++++++ .../harness/conformance/test_conformance.py | 28 +++++++++++++ 4 files changed, 100 insertions(+) create mode 100644 .github/workflows/harness-integration.yml create mode 100644 tests/lib/core/harness/conformance/__init__.py create mode 100644 tests/lib/core/harness/conformance/runner.py create mode 100644 tests/lib/core/harness/conformance/test_conformance.py diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml new file mode 100644 index 000000000..33ca06728 --- /dev/null +++ b/.github/workflows/harness-integration.yml @@ -0,0 +1,33 @@ +name: Harness Integration + +on: + pull_request: + paths: + - "src/agentex/lib/core/harness/**" + - "src/agentex/lib/adk/_modules/**" + - ".github/workflows/harness-integration.yml" + +jobs: + conformance: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + - name: Conformance suite + run: uv run pytest tests/lib/core/harness/ -v + + # Live integration matrix (harness x {sync, async, temporal}) is added per-harness + # in the migration plans. 
Placeholder job keeps the workflow valid until then. + live-matrix: + runs-on: ubuntu-latest + if: false # enabled once the first harness's test agents land + steps: + - run: echo "populated by migration PRs" diff --git a/tests/lib/core/harness/conformance/__init__.py b/tests/lib/core/harness/conformance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py new file mode 100644 index 000000000..210514b41 --- /dev/null +++ b/tests/lib/core/harness/conformance/runner.py @@ -0,0 +1,39 @@ +"""Shared conformance engine: every harness tap registers fixtures here. + +A fixture is (name, list[StreamTaskMessage]). The runner asserts that span +derivation over the events is identical regardless of delivery channel, which is +the cross-channel guarantee from the spec. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage + + +@dataclass +class Fixture: + name: str + events: list[StreamTaskMessage] + + +_REGISTRY: list[Fixture] = [] + + +def register(fixture: Fixture) -> None: + _REGISTRY.append(fixture) + + +def all_fixtures() -> list[Fixture]: + return list(_REGISTRY) + + +def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]: + d = SpanDeriver() + out: list[SpanSignal] = [] + for e in events: + out.extend(d.observe(e)) + out.extend(d.flush()) + return out diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py new file mode 100644 index 000000000..cc350df3a --- /dev/null +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -0,0 +1,28 @@ +import pytest + +from tests.lib.core.harness.conformance.runner import Fixture, derive_all, register, all_fixtures +from agentex.types.task_message_update import ( + 
StreamTaskMessageStart, StreamTaskMessageDone, StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + +register(Fixture( + name="builtin-single-tool", + events=[ + StreamTaskMessageStart(type="start", index=0, + content=ToolRequestContent(type="tool_request", author="agent", + tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull(type="full", index=1, + content=ToolResponseContent(type="tool_response", author="agent", + tool_call_id="c", name="Bash", content="ok")), + ], +)) + + +@pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) +def test_span_derivation_is_deterministic(fixture): + # Deriving twice over the same events yields identical signals (the property + # that makes yield vs auto-send equivalent, since both observe the same stream). + assert derive_all(fixture.events) == derive_all(fixture.events) From 520849afe3f96a024cb184bd8e9d4d3663330529 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 12:56:34 -0400 Subject: [PATCH 18/35] test(harness): match scripts/test invocation + document conformance registry semantics Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/harness-integration.yml | 11 +++++++++-- tests/lib/core/harness/conformance/runner.py | 9 +++++++++ .../lib/core/harness/conformance/test_conformance.py | 2 ++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml index 33ca06728..ab6b353b9 100644 --- a/.github/workflows/harness-integration.yml +++ b/.github/workflows/harness-integration.yml @@ -1,6 +1,8 @@ name: Harness Integration on: + push: + branches: [main] pull_request: paths: - "src/agentex/lib/core/harness/**" @@ -21,8 +23,13 @@ jobs: - name: Bootstrap run: ./scripts/bootstrap + # Defer to scripts/test so the harness suite runs 
under the exact same + # invocation as the main CI test job: DEFER_PYDANTIC_BUILD=false and + # `uv run --isolated --all-packages --all-extras pytest`, across the + # min/max supported Python versions. Running `uv run pytest` directly + # would risk an all-extras-only dep passing locally but failing in CI. - name: Conformance suite - run: uv run pytest tests/lib/core/harness/ -v + run: ./scripts/test tests/lib/core/harness/ -v # Live integration matrix (harness x {sync, async, temporal}) is added per-harness # in the migration plans. Placeholder job keeps the workflow valid until then. @@ -30,4 +37,4 @@ jobs: runs-on: ubuntu-latest if: false # enabled once the first harness's test agents land steps: - - run: echo "populated by migration PRs" + - run: echo "populated by migration PRs" # TODO(harness-migration): enable per-harness; see docs/superpowers/plans migration PRs 4-8 diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py index 210514b41..ffd72f89a 100644 --- a/tests/lib/core/harness/conformance/runner.py +++ b/tests/lib/core/harness/conformance/runner.py @@ -3,6 +3,15 @@ A fixture is (name, list[StreamTaskMessage]). The runner asserts that span derivation over the events is identical regardless of delivery channel, which is the cross-channel guarantee from the spec. + +Registry shared-state hazard: `_REGISTRY` is process-global. Every `test_*.py` +module that calls `register()` at import time contributes to it, so a module +that parametrizes over `all_fixtures()` will see fixtures registered by ANY +other conformance module imported earlier in the same pytest process (collection +order is not guaranteed). To stay deterministic, each future harness conformance +module should register and parametrize over its OWN fixtures (e.g. keep a +module-local list it both registers and parametrizes), rather than relying on +cross-module global accumulation via `all_fixtures()`. 
""" from __future__ import annotations diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py index cc350df3a..6080ca5ef 100644 --- a/tests/lib/core/harness/conformance/test_conformance.py +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -23,6 +23,8 @@ @pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) def test_span_derivation_is_deterministic(fixture): + """Exercises the cross-channel guarantee: yield and auto-send observe the + same event stream, so span derivation must be deterministic/idempotent.""" # Deriving twice over the same events yields identical signals (the property # that makes yield vs auto-send equivalent, since both observe the same stream). assert derive_all(fixture.events) == derive_all(fixture.events) From a915170b3efb3ecc18b46934ef7b7f7f38150df4 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 13:00:25 -0400 Subject: [PATCH 19/35] refactor(harness): isinstance narrowing for clean type-check across the package Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/auto_send.py | 6 ++---- src/agentex/lib/core/harness/span_derivation.py | 12 +++++++----- src/agentex/lib/core/harness/yield_delivery.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py index e7de01a68..850146ab7 100644 --- a/src/agentex/lib/core/harness/auto_send.py +++ b/src/agentex/lib/core/harness/auto_send.py @@ -10,6 +10,7 @@ StreamTaskMessageFull, StreamTaskMessageStart, ) +from agentex.types.text_delta import TextDelta from agentex.lib.core.harness.span_derivation import SpanDeriver from agentex.lib.core.harness.tracer import SpanTracer @@ -83,10 +84,7 @@ async def _close_current() -> None: index=event.index, ) await current_ctx.stream_update(delta_with_parent) - if ( - getattr(event.delta, "type", None) == "text" - and 
event.delta.text_delta - ): + if isinstance(event.delta, TextDelta) and event.delta.text_delta: final_text_parts.append(event.delta.text_delta) elif isinstance(event, StreamTaskMessageDone): diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py index 15e1f593f..eac929ee5 100644 --- a/src/agentex/lib/core/harness/span_derivation.py +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -15,6 +15,9 @@ StreamTaskMessageFull, StreamTaskMessageStart, ) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.tool_response_content import ToolResponseContent from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal, StreamTaskMessage @@ -79,15 +82,14 @@ def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: return [] idx = event.index content = event.content - ctype = getattr(content, "type", None) - if ctype == "tool_request": + if isinstance(content, ToolRequestContent): self._tool_by_index[idx] = _ToolReqMeta( tool_call_id=content.tool_call_id, name=content.name, arguments=dict(content.arguments or {}), ) return [] - if ctype == "reasoning": + if content.type == "reasoning": self._reasoning_index_open.add(idx) return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] return [] @@ -97,7 +99,7 @@ def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: return [] idx = event.index delta = event.delta - if delta is not None and getattr(delta, "type", None) == "tool_request": + if isinstance(delta, ToolRequestDelta): meta = self._tool_by_index.get(idx) if meta is not None and delta.arguments_delta: meta.args_buf += delta.arguments_delta @@ -105,7 +107,7 @@ def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: content = event.content - if getattr(content, 
"type", None) == "tool_response": + if isinstance(content, ToolResponseContent): tcid = content.tool_call_id if tcid in self._open_tool_ids: self._open_tool_ids.pop(tcid, None) diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py index ca923c6a3..0d90d5d94 100644 --- a/src/agentex/lib/core/harness/yield_delivery.py +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -21,11 +21,11 @@ async def yield_events( deriver = SpanDeriver() if tracer is not None else None try: async for event in events: - if deriver is not None: # tracer is non-None whenever deriver is set + if deriver is not None and tracer is not None: for signal in deriver.observe(event): await tracer.handle(signal) yield event finally: - if deriver is not None: # tracer is non-None whenever deriver is set + if deriver is not None and tracer is not None: for signal in deriver.flush(): await tracer.handle(signal) From e7b9c5209852200ccac31db170c435a2646f8893 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 13:04:07 -0400 Subject: [PATCH 20/35] refactor(harness): narrow auto_send tracer guards, drop type:ignore for consistency Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/auto_send.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py index 850146ab7..ee17fdc56 100644 --- a/src/agentex/lib/core/harness/auto_send.py +++ b/src/agentex/lib/core/harness/auto_send.py @@ -58,9 +58,9 @@ async def _close_current() -> None: try: async for event in events: - if deriver is not None: + if deriver is not None and tracer is not None: for signal in deriver.observe(event): - await tracer.handle(signal) # type: ignore[union-attr] + await tracer.handle(signal) if isinstance(event, StreamTaskMessageStart): ctype = getattr(event.content, "type", None) @@ -107,8 +107,8 @@ async def _close_current() -> None: finally: await 
_close_current() - if deriver is not None: + if deriver is not None and tracer is not None: for signal in deriver.flush(): - await tracer.handle(signal) # type: ignore[union-attr] + await tracer.handle(signal) return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage()) From ebc468d014e2a243b245c29d6395d4055beccbde Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 13:29:20 -0400 Subject: [PATCH 21/35] style: ruff import-sort + format fixes across the harness package Co-Authored-By: Claude Opus 4.8 (1M context) --- src/agentex/lib/core/harness/__init__.py | 12 +- src/agentex/lib/core/harness/auto_send.py | 9 +- src/agentex/lib/core/harness/emitter.py | 4 +- .../lib/core/harness/span_derivation.py | 7 +- src/agentex/lib/core/harness/tracer.py | 2 +- src/agentex/lib/core/harness/types.py | 9 +- .../lib/core/harness/yield_delivery.py | 6 +- tests/lib/core/harness/conformance/runner.py | 2 +- .../harness/conformance/test_conformance.py | 40 ++++--- tests/lib/core/harness/test_auto_send.py | 111 +++++++++++------- tests/lib/core/harness/test_emitter.py | 24 ++-- .../lib/core/harness/test_span_derivation.py | 107 ++++++++++------- tests/lib/core/harness/test_tracer.py | 2 +- tests/lib/core/harness/test_types.py | 4 +- tests/lib/core/harness/test_yield_delivery.py | 46 +++++--- 15 files changed, 228 insertions(+), 157 deletions(-) diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py index 2988db8ff..067751d63 100644 --- a/src/agentex/lib/core/harness/__init__.py +++ b/src/agentex/lib/core/harness/__init__.py @@ -5,17 +5,17 @@ harness tap gets streaming + tracing + turn usage uniformly. 
""" -from agentex.lib.core.harness.emitter import UnifiedEmitter -from agentex.lib.core.harness.tracer import SpanTracer from agentex.lib.core.harness.types import ( - CloseSpan, - HarnessTurn, OpenSpan, + CloseSpan, + TurnUsage, SpanSignal, - StreamTaskMessage, TurnResult, - TurnUsage, + HarnessTurn, + StreamTaskMessage, ) +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter __all__ = [ "UnifiedEmitter", diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py index ee17fdc56..b246cf38c 100644 --- a/src/agentex/lib/core/harness/auto_send.py +++ b/src/agentex/lib/core/harness/auto_send.py @@ -4,17 +4,16 @@ from typing import Any, AsyncIterator +from agentex.types.text_delta import TextDelta +from agentex.lib.core.harness.types import TurnUsage, TurnResult, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer from agentex.types.task_message_update import ( - StreamTaskMessageDelta, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDelta, StreamTaskMessageStart, ) -from agentex.types.text_delta import TextDelta - from agentex.lib.core.harness.span_derivation import SpanDeriver -from agentex.lib.core.harness.tracer import SpanTracer -from agentex.lib.core.harness.types import StreamTaskMessage, TurnResult, TurnUsage async def auto_send( diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py index 9573fb8b2..681c859ea 100644 --- a/src/agentex/lib/core/harness/emitter.py +++ b/src/agentex/lib/core/harness/emitter.py @@ -4,9 +4,9 @@ from typing import AsyncGenerator -from agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.types import TurnResult, HarnessTurn, StreamTaskMessage from agentex.lib.core.harness.tracer import SpanTracer -from agentex.lib.core.harness.types import HarnessTurn, StreamTaskMessage, TurnResult +from agentex.lib.core.harness.auto_send 
import auto_send from agentex.lib.core.harness.yield_delivery import yield_events diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py index eac929ee5..d353cf9e0 100644 --- a/src/agentex/lib/core/harness/span_derivation.py +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -9,18 +9,17 @@ import json from dataclasses import dataclass +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal, StreamTaskMessage +from agentex.types.tool_request_delta import ToolRequestDelta from agentex.types.task_message_update import ( - StreamTaskMessageDelta, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDelta, StreamTaskMessageStart, ) from agentex.types.tool_request_content import ToolRequestContent -from agentex.types.tool_request_delta import ToolRequestDelta from agentex.types.tool_response_content import ToolResponseContent -from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal, StreamTaskMessage - @dataclass class _ToolReqMeta: diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py index 3f4ff40c2..8384407bd 100644 --- a/src/agentex/lib/core/harness/tracer.py +++ b/src/agentex/lib/core/harness/tracer.py @@ -4,7 +4,7 @@ from typing import Any -from agentex.lib.core.harness.types import CloseSpan, OpenSpan, SpanSignal +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal try: from agentex.lib.utils.logging import make_logger diff --git a/src/agentex/lib/core/harness/types.py b/src/agentex/lib/core/harness/types.py index f31b2c67f..64104d316 100644 --- a/src/agentex/lib/core/harness/types.py +++ b/src/agentex/lib/core/harness/types.py @@ -2,16 +2,17 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import Any, AsyncIterator, Literal, Protocol, Union, runtime_checkable +from typing import Any, Union, Literal, Protocol, AsyncIterator, runtime_checkable +from 
dataclasses import field, dataclass + +from pydantic import BaseModel, ConfigDict from agentex.types.task_message_update import ( - StreamTaskMessageDelta, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDelta, StreamTaskMessageStart, ) -from pydantic import BaseModel, ConfigDict # The canonical stream element. Taps yield these; delivery adapters consume them. StreamTaskMessage = Union[ diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py index 0d90d5d94..69b39f152 100644 --- a/src/agentex/lib/core/harness/yield_delivery.py +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -2,11 +2,11 @@ from __future__ import annotations -from typing import AsyncGenerator, AsyncIterator +from typing import AsyncIterator, AsyncGenerator -from agentex.lib.core.harness.span_derivation import SpanDeriver -from agentex.lib.core.harness.tracer import SpanTracer from agentex.lib.core.harness.types import StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.span_derivation import SpanDeriver async def yield_events( diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py index ffd72f89a..81a74860c 100644 --- a/tests/lib/core/harness/conformance/runner.py +++ b/tests/lib/core/harness/conformance/runner.py @@ -18,8 +18,8 @@ from dataclasses import dataclass -from agentex.lib.core.harness.span_derivation import SpanDeriver from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage +from agentex.lib.core.harness.span_derivation import SpanDeriver @dataclass diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py index 6080ca5ef..1d686c33a 100644 --- a/tests/lib/core/harness/conformance/test_conformance.py +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -1,24 +1,36 @@ import pytest -from 
tests.lib.core.harness.conformance.runner import Fixture, derive_all, register, all_fixtures from agentex.types.task_message_update import ( - StreamTaskMessageStart, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, ) from agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent +from tests.lib.core.harness.conformance.runner import Fixture, register, derive_all, all_fixtures -register(Fixture( - name="builtin-single-tool", - events=[ - StreamTaskMessageStart(type="start", index=0, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id="c", name="Bash", arguments={})), - StreamTaskMessageDone(type="done", index=0), - StreamTaskMessageFull(type="full", index=1, - content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="c", name="Bash", content="ok")), - ], -)) +register( + Fixture( + name="builtin-single-tool", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ], + ) +) @pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) diff --git a/tests/lib/core/harness/test_auto_send.py b/tests/lib/core/harness/test_auto_send.py index 9568d7b87..e7331e67c 100644 --- a/tests/lib/core/harness/test_auto_send.py +++ b/tests/lib/core/harness/test_auto_send.py @@ -13,17 +13,17 @@ import pytest -from agentex.lib.core.harness.auto_send import auto_send -from agentex.lib.core.harness.tracer import SpanTracer from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from 
agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import ( - StreamTaskMessageStart, - StreamTaskMessageDelta, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, ) -from agentex.types.text_content import TextContent -from agentex.types.task_message_delta import TextDelta +from agentex.lib.core.harness.auto_send import auto_send from agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent @@ -40,9 +40,7 @@ def __init__(self, sink, content_type, initial_content): self.sink = sink self.content_type = content_type # Real TaskMessage so StreamTaskMessageDelta(parent_task_message=...) passes validation - self.task_message = TaskMessage( - id="msg-1", task_id="task1", content=initial_content - ) + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) async def __aenter__(self): self.sink.append(("open", self.content_type)) @@ -67,9 +65,7 @@ class _FakeStreaming: def __init__(self): self.sink = [] - def streaming_task_message_context( - self, task_id, initial_content, streaming_mode="coalesced", created_at=None - ): + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): ctype = getattr(initial_content, "type", None) self.sink.append(("ctx", ctype)) return _FakeCtx(self.sink, ctype, initial_content) @@ -84,20 +80,24 @@ async def _gen(events): # Test 1: text streaming — open, stream deltas, close; return accumulated text # --------------------------------------------------------------------------- + @pytest.mark.asyncio async def test_auto_send_streams_text_and_returns_final_text(): streaming = _FakeStreaming() events = [ StreamTaskMessageStart( - type="start", index=0, + type="start", + index=0, content=TextContent(type="text", author="agent", content=""), ), 
StreamTaskMessageDelta( - type="delta", index=0, + type="delta", + index=0, delta=TextDelta(type="text", text_delta="Hel"), ), StreamTaskMessageDelta( - type="delta", index=0, + type="delta", + index=0, delta=TextDelta(type="text", text_delta="lo"), ), StreamTaskMessageDone(type="done", index=0), @@ -122,6 +122,7 @@ async def test_auto_send_streams_text_and_returns_final_text(): # (open context with the content, no deltas, close immediately) # --------------------------------------------------------------------------- + @pytest.mark.asyncio async def test_auto_send_posts_full_tool_messages(): streaming = _FakeStreaming() @@ -129,24 +130,36 @@ async def test_auto_send_posts_full_tool_messages(): # A bare tool_request Start (no Done/Full) must NOT open a streaming # context on its own — only Full events post messages. StreamTaskMessageStart( - type="start", index=0, + type="start", + index=0, content=ToolRequestContent( - type="tool_request", author="agent", - tool_call_id="c0", name="Bash", arguments={}, + type="tool_request", + author="agent", + tool_call_id="c0", + name="Bash", + arguments={}, ), ), StreamTaskMessageFull( - type="full", index=1, + type="full", + index=1, content=ToolRequestContent( - type="tool_request", author="agent", - tool_call_id="c1", name="Bash", arguments={"cmd": "ls"}, + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={"cmd": "ls"}, ), ), StreamTaskMessageFull( - type="full", index=2, + type="full", + index=2, content=ToolResponseContent( - type="tool_response", author="agent", - tool_call_id="c1", name="Bash", content="file.py", + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="file.py", ), ), ] @@ -176,6 +189,7 @@ async def test_auto_send_posts_full_tool_messages(): # Test 3: tracing — spans are derived and handed to the tracer # --------------------------------------------------------------------------- + class _RecordTracing: def __init__(self): 
self.started, self.ended = [], [] @@ -196,25 +210,31 @@ async def test_auto_send_derives_tool_spans_via_tracer(): events = [ StreamTaskMessageStart( - type="start", index=0, + type="start", + index=0, content=ToolRequestContent( - type="tool_request", author="agent", - tool_call_id="c1", name="Bash", arguments={}, + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={}, ), ), StreamTaskMessageDone(type="done", index=0), StreamTaskMessageFull( - type="full", index=1, + type="full", + index=1, content=ToolResponseContent( - type="tool_response", author="agent", - tool_call_id="c1", name="Bash", content="ok", + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="ok", ), ), ] - result = await auto_send( - _gen(events), task_id="task1", tracer=tracer, streaming=streaming - ) + result = await auto_send(_gen(events), task_id="task1", tracer=tracer, streaming=streaming) assert result.final_text == "" assert fake_tracing.started == ["Bash"] @@ -225,24 +245,31 @@ async def test_auto_send_derives_tool_spans_via_tracer(): # Test 4: text followed by a tool Full — text context is closed before Full # --------------------------------------------------------------------------- + @pytest.mark.asyncio async def test_auto_send_closes_text_context_before_full_message(): streaming = _FakeStreaming() events = [ StreamTaskMessageStart( - type="start", index=0, + type="start", + index=0, content=TextContent(type="text", author="agent", content=""), ), StreamTaskMessageDelta( - type="delta", index=0, + type="delta", + index=0, delta=TextDelta(type="text", text_delta="Hi"), ), StreamTaskMessageDone(type="done", index=0), StreamTaskMessageFull( - type="full", index=1, + type="full", + index=1, content=ToolRequestContent( - type="tool_request", author="agent", - tool_call_id="c2", name="read_file", arguments={}, + type="tool_request", + author="agent", + tool_call_id="c2", + name="read_file", + arguments={}, ), ), ] @@ 
-261,21 +288,21 @@ async def test_auto_send_closes_text_context_before_full_message(): # Test 5: midstream error — propagates AND the open context is closed (finally) # --------------------------------------------------------------------------- + @pytest.mark.asyncio async def test_open_context_closed_on_midstream_error(): streaming = _FakeStreaming() async def _exploding_gen(): yield StreamTaskMessageStart( - type="start", index=0, + type="start", + index=0, content=TextContent(type="text", author="agent", content=""), ) raise RuntimeError("boom") with pytest.raises(RuntimeError, match="boom"): - await auto_send( - _exploding_gen(), task_id="task1", tracer=None, streaming=streaming - ) + await auto_send(_exploding_gen(), task_id="task1", tracer=None, streaming=streaming) # The text context that was opened mid-stream was closed by the finally block. assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py index 963a77dfe..ee3052f47 100644 --- a/tests/lib/core/harness/test_emitter.py +++ b/tests/lib/core/harness/test_emitter.py @@ -1,15 +1,15 @@ import pytest -from agentex.lib.core.harness.emitter import UnifiedEmitter -from agentex.lib.core.harness.types import TurnUsage from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.core.harness.emitter import UnifiedEmitter from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import ( - StreamTaskMessageDelta, StreamTaskMessageDone, + StreamTaskMessageDelta, StreamTaskMessageStart, ) -from agentex.types.text_content import TextContent class _FakeTracing: @@ -48,9 +48,7 @@ class _FakeStreaming: def __init__(self): self.sink = [] - def streaming_task_message_context( - self, task_id, initial_content, streaming_mode="coalesced", created_at=None - ): + def 
streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): ctype = getattr(initial_content, "type", None) self.sink.append(("ctx", ctype)) return _FakeCtx(self.sink, ctype, initial_content) @@ -73,8 +71,7 @@ def usage(self): @pytest.mark.asyncio async def test_emitter_yield_mode_passes_through(): events = [ - StreamTaskMessageStart(type="start", index=0, - content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="hi")), StreamTaskMessageDone(type="done", index=0), ] turn = _Turn(events, TurnUsage(model="m")) @@ -87,8 +84,7 @@ async def test_emitter_yield_mode_passes_through(): async def test_emitter_tracing_default_on_when_trace_id_present(): # Inject a fake tracing backend so the test env doesn't need temporalio. # This exercises the default-on path (tracer=None) when trace_id is truthy. - emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", - tracing=_FakeTracing()) + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracing=_FakeTracing()) assert emitter.tracer is not None @@ -102,10 +98,8 @@ async def test_emitter_tracing_overridable_off(): async def test_emitter_auto_send_turn_returns_usage(): usage = TurnUsage(model="m", input_tokens=5) events = [ - StreamTaskMessageStart(type="start", index=0, - content=TextContent(type="text", author="agent", content="")), - StreamTaskMessageDelta(type="delta", index=0, - delta=TextDelta(type="text", text_delta="Hello")), + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hello")), StreamTaskMessageDone(type="done", index=0), ] turn = _Turn(events, usage) diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py index 
0630131d0..7779de815 100644 --- a/tests/lib/core/harness/test_span_derivation.py +++ b/tests/lib/core/harness/test_span_derivation.py @@ -1,16 +1,16 @@ -from agentex.lib.core.harness.span_derivation import SpanDeriver +from agentex.types.text_content import TextContent from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_delta import ToolRequestDelta from agentex.types.task_message_update import ( - StreamTaskMessageStart, - StreamTaskMessageDelta, - StreamTaskMessageFull, StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, ) -from agentex.types.text_content import TextContent -from agentex.types.reasoning_content import ReasoningContent from agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent -from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.lib.core.harness.span_derivation import SpanDeriver def _signals(deriver, events): @@ -23,19 +23,17 @@ def _signals(deriver, events): def _tool_req(idx, tcid, name, args): return StreamTaskMessageStart( - type="start", index=idx, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id=tcid, name=name, arguments=args), + type="start", + index=idx, + content=ToolRequestContent(type="tool_request", author="agent", tool_call_id=tcid, name=name, arguments=args), ) def test_text_only_yields_no_spans(): d = SpanDeriver() events = [ - StreamTaskMessageStart(type="start", index=0, - content=TextContent(type="text", author="agent", content="")), - StreamTaskMessageDelta(type="delta", index=0, - delta=None), + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=None), StreamTaskMessageDone(type="done", index=0), ] assert _signals(d, events) == [] @@ 
-46,9 +44,13 @@ def test_single_tool_opens_on_done_closes_on_response(): events = [ _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), StreamTaskMessageDone(type="done", index=0), - StreamTaskMessageFull(type="full", index=1, - content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="call_1", name="Bash", content="files")), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="call_1", name="Bash", content="files" + ), + ), ] sigs = _signals(d, events) assert sigs == [ @@ -60,8 +62,9 @@ def test_single_tool_opens_on_done_closes_on_response(): def test_reasoning_opens_on_start_closes_on_done(): d = SpanDeriver() events = [ - StreamTaskMessageStart(type="start", index=0, - content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[])), + StreamTaskMessageStart( + type="start", index=0, content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[]) + ), StreamTaskMessageDone(type="done", index=0), ] sigs = _signals(d, events) @@ -76,12 +79,20 @@ def test_parallel_tools_pair_by_tool_call_id(): _tool_req(1, "b", "T2", {}), StreamTaskMessageDone(type="done", index=0), StreamTaskMessageDone(type="done", index=1), - StreamTaskMessageFull(type="full", index=2, - content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="b", name="T2", content="rb")), - StreamTaskMessageFull(type="full", index=3, - content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="a", name="T1", content="ra")), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="b", name="T2", content="rb" + ), + ), + StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="a", name="T1", content="ra" + ), + ), ] sigs = _signals(d, events) opens = [s for s in sigs if 
isinstance(s, OpenSpan)] @@ -94,15 +105,23 @@ def test_parallel_tools_pair_by_tool_call_id(): def test_streamed_args_accumulate_into_open_input(): d = SpanDeriver() events = [ - StreamTaskMessageStart(type="start", index=0, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id="c", name="Bash", arguments={})), - StreamTaskMessageDelta(type="delta", index=0, - delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", - arguments_delta='{"cmd":')), - StreamTaskMessageDelta(type="delta", index=0, - delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", - arguments_delta='"ls"}')), + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='{"cmd":'), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='"ls"}'), + ), StreamTaskMessageDone(type="done", index=0), ] sigs = _signals(d, events) @@ -123,9 +142,13 @@ def test_unclosed_tool_closed_incomplete_on_flush(): def test_none_index_is_skipped(): d = SpanDeriver() events = [ - StreamTaskMessageStart(type="start", index=None, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id="n", name="Bash", arguments={})), + StreamTaskMessageStart( + type="start", + index=None, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="n", name="Bash", arguments={} + ), + ), StreamTaskMessageDone(type="done", index=None), ] assert _signals(d, events) == [] @@ -134,8 +157,12 @@ def test_none_index_is_skipped(): def test_orphan_tool_response_ignored(): d = SpanDeriver() events = [ - StreamTaskMessageFull(type="full", index=0, - 
content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="z", name="Bash", content="r")), + StreamTaskMessageFull( + type="full", + index=0, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="z", name="Bash", content="r" + ), + ), ] assert _signals(d, events) == [] diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py index f5fdb16b6..7e1a4bd67 100644 --- a/tests/lib/core/harness/test_tracer.py +++ b/tests/lib/core/harness/test_tracer.py @@ -1,7 +1,7 @@ import pytest -from agentex.lib.core.harness.tracer import SpanTracer from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer class _FakeSpan: diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py index 91857993a..68bc89ce2 100644 --- a/tests/lib/core/harness/test_types.py +++ b/tests/lib/core/harness/test_types.py @@ -3,10 +3,10 @@ from agentex.lib.core.harness.types import ( OpenSpan, CloseSpan, - HarnessTurn, - StreamTaskMessage, TurnUsage, TurnResult, + HarnessTurn, + StreamTaskMessage, ) diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py index 986b4a92d..f3f491d84 100644 --- a/tests/lib/core/harness/test_yield_delivery.py +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -2,15 +2,15 @@ import pytest -from agentex.lib.core.harness.yield_delivery import yield_events from agentex.lib.core.harness.tracer import SpanTracer from agentex.types.task_message_update import ( - StreamTaskMessageStart, StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageStart, ) from agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.yield_delivery import yield_events class _RecordTracing: @@ -35,18 +35,26 @@ async def test_yield_passes_events_through_and_traces(): fake 
= _RecordTracing() tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) events = [ - StreamTaskMessageStart(type="start", index=0, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), StreamTaskMessageDone(type="done", index=0), - StreamTaskMessageFull(type="full", index=1, - content=ToolResponseContent(type="tool_response", author="agent", - tool_call_id="c", name="Bash", content="ok")), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), ] out = [e async for e in yield_events(_gen(events), tracer=tracer)] - assert out == events # passthrough unchanged - assert fake.started == ["Bash"] # span derived + opened - assert fake.ended == ["ok"] # span closed with response + assert out == events # passthrough unchanged + assert fake.started == ["Bash"] # span derived + opened + assert fake.ended == ["ok"] # span closed with response @pytest.mark.asyncio @@ -63,15 +71,19 @@ async def test_flush_runs_on_early_close(): fake = _RecordTracing() tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) events = [ - StreamTaskMessageStart(type="start", index=0, - content=ToolRequestContent(type="tool_request", author="agent", - tool_call_id="c", name="Bash", arguments={})), + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), StreamTaskMessageDone(type="done", index=0), # response intentionally never arrives ] gen = yield_events(_gen(events), tracer=tracer) - first = await gen.__anext__() # Start + first = await gen.__anext__() # Start second = await gen.__anext__() # Done -> 
tool span opens here - await gen.aclose() # triggers the finally -> flush() + await gen.aclose() # triggers the finally -> flush() assert fake.started == ["Bash"] - assert fake.ended == [None] # flush closed the unpaired span (incomplete, no output) + assert fake.ended == [None] # flush closed the unpaired span (incomplete, no output) From 8b0da837d2ee13de1f7d93ebda0fa594cc0f5cd6 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 15:21:03 -0400 Subject: [PATCH 22/35] fix(harness): mark overridden start_span with @override for pyright (reportImplicitOverride) Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lib/core/harness/test_tracer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py index 7e1a4bd67..315b74417 100644 --- a/tests/lib/core/harness/test_tracer.py +++ b/tests/lib/core/harness/test_tracer.py @@ -1,3 +1,5 @@ +from typing import override + import pytest from agentex.lib.core.harness.types import OpenSpan, CloseSpan @@ -45,6 +47,7 @@ async def test_no_trace_id_is_noop(): @pytest.mark.asyncio async def test_tracing_failure_is_swallowed(): class _Boom(_FakeTracing): + @override async def start_span(self, **kw): raise RuntimeError("backend down") From f9266cf6b07ee952b19b75909ce7e3a7a9138908 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 15:23:26 -0400 Subject: [PATCH 23/35] fix(harness): relative import in conformance test for pyright (reportMissingImports) Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lib/core/harness/conformance/test_conformance.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py index 1d686c33a..d9eec1c15 100644 --- a/tests/lib/core/harness/conformance/test_conformance.py +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -7,7 +7,8 @@ ) from 
agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent -from tests.lib.core.harness.conformance.runner import Fixture, register, derive_all, all_fixtures + +from .runner import Fixture, register, derive_all, all_fixtures register( Fixture( From b538187a43577282544cc474c2c3b8f627a9f2cf Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:51:49 -0400 Subject: [PATCH 24/35] fix(harness): index-keyed routing, tool stream delivery, final_text last-segment, created_at (AGX1-377, AGX1-378) auto_send.py: - Replace single current_ctx with ctx_map[index] so parallel streams route correctly - Open a streaming context for ALL content types on Start (not just text/reasoning), fixing tool_request/tool_response stream delivery (AGX1-377) - Reset final_text_parts on each new Start(TextContent) and on Full(TextContent) so multi-step turns return the LAST text segment, not the full accumulation - Add created_at: datetime | None param; forward to every streaming_task_message_context call (AGX1-378) span_derivation.py: - _on_full: handle Full(ToolRequestContent) by opening a tool span keyed by tool_call_id if not already open; adds LangGraph full-event harness support Co-Authored-By: Claude Sonnet 4.6 --- src/agentex/lib/core/harness/auto_send.py | 86 ++++++++++++------- .../lib/core/harness/span_derivation.py | 14 +++ 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py index b246cf38c..899429034 100644 --- a/src/agentex/lib/core/harness/auto_send.py +++ b/src/agentex/lib/core/harness/auto_send.py @@ -3,8 +3,10 @@ from __future__ import annotations from typing import Any, AsyncIterator +from datetime import datetime from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent from agentex.lib.core.harness.types import TurnUsage, TurnResult, StreamTaskMessage 
from agentex.lib.core.harness.tracer import SpanTracer from agentex.types.task_message_update import ( @@ -22,14 +24,27 @@ async def auto_send( tracer: SpanTracer | None = None, streaming: Any = None, usage: TurnUsage | None = None, + created_at: datetime | None = None, ) -> TurnResult: """Push the canonical stream to the task stream via adk.streaming. - Opens a streaming context per text/reasoning message, streams deltas via + Opens a streaming context per message (keyed by index), streams deltas via ctx.stream_update, and closes via ctx.close() on Done. Posts tool request/response full messages by opening a context with the content and closing it immediately (no deltas). Derives and traces spans from the same - stream. Returns the accumulated final text + usage. + stream. Returns the last text segment's text + usage. + + Index-keyed routing: each Start(index=i) opens a context stored in + ctx_map[i]; Delta(index=i) routes to ctx_map.get(i); Done(index=i) closes + and removes ctx_map[i]. Events with index is None are skipped. The finally + block closes all remaining open contexts. + + final_text last-segment semantics: a new Start(TextContent) resets + final_text_parts so that multi-step turns return the LAST text segment. + Full(TextContent) also overwrites final_text_parts (same semantics). + + AGX1-378: created_at is forwarded to every streaming_task_message_context + call so callers can back-date message timestamps. 
Mirrors the open/close/stream_update pattern from src/agentex/lib/adk/_modules/_langgraph_async.py: @@ -47,13 +62,12 @@ async def auto_send( deriver = SpanDeriver() if tracer is not None else None final_text_parts: list[str] = [] - current_ctx: Any = None + ctx_map: dict[int, Any] = {} - async def _close_current() -> None: - nonlocal current_ctx - if current_ctx is not None: - await current_ctx.close() - current_ctx = None + async def _close_all() -> None: + for ctx in list(ctx_map.values()): + await ctx.close() + ctx_map.clear() try: async for event in events: @@ -62,50 +76,64 @@ async def _close_current() -> None: await tracer.handle(signal) if isinstance(event, StreamTaskMessageStart): - ctype = getattr(event.content, "type", None) - if ctype in ("text", "reasoning"): - await _close_current() - ctx = streaming.streaming_task_message_context( - task_id=task_id, - initial_content=event.content, - ) - current_ctx = await ctx.__aenter__() + if event.index is None: + continue + i = event.index + # Reset final_text_parts when a new text segment starts + if isinstance(event.content, TextContent): + final_text_parts = [] + ctx = streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + created_at=created_at, + ) + ctx_map[i] = await ctx.__aenter__() elif isinstance(event, StreamTaskMessageDelta): - if current_ctx is not None and event.delta is not None: + if event.index is None: + continue + ctx = ctx_map.get(event.index) + if ctx is not None and event.delta is not None: # Reconstruct the delta with parent_task_message set from # the context's task_message (mirrors _langgraph_async.py # lines 72-78 and 117-127). 
delta_with_parent = StreamTaskMessageDelta( - parent_task_message=current_ctx.task_message, + parent_task_message=ctx.task_message, delta=event.delta, type="delta", index=event.index, ) - await current_ctx.stream_update(delta_with_parent) + await ctx.stream_update(delta_with_parent) if isinstance(event.delta, TextDelta) and event.delta.text_delta: final_text_parts.append(event.delta.text_delta) elif isinstance(event, StreamTaskMessageDone): - await _close_current() + if event.index is None: + continue + ctx = ctx_map.pop(event.index, None) + if ctx is not None: + await ctx.close() elif isinstance(event, StreamTaskMessageFull): - # Full messages (tool_request / tool_response): close any open - # streaming context first, then post the full message by opening - # a context with the content and closing it immediately - # (no deltas; StreamingTaskMessageContext.close() persists - # initial_content when the accumulator is empty). Use async with - # so the context is closed even if close() raises (__aexit__ - # delegates to close()). - await _close_current() + # Full messages: post the full message by opening a context + # with the content and closing it immediately (no deltas; + # StreamingTaskMessageContext.close() persists initial_content + # when the accumulator is empty). Use async with so the context + # is closed even if close() raises (__aexit__ delegates to + # close()). + # Full(TextContent) also resets final_text_parts for + # last-segment semantics. 
+ if isinstance(event.content, TextContent): + final_text_parts = [event.content.content] async with streaming.streaming_task_message_context( task_id=task_id, initial_content=event.content, + created_at=created_at, ): pass finally: - await _close_current() + await _close_all() if deriver is not None and tracer is not None: for signal in deriver.flush(): await tracer.handle(signal) diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py index d353cf9e0..503957582 100644 --- a/src/agentex/lib/core/harness/span_derivation.py +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -105,7 +105,21 @@ def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: return [] def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + """Handle a Full event. + + A `Full(ToolRequestContent)` opens a tool span (keyed by tool_call_id) + if it is not already open; the matching `Full(ToolResponseContent)` + closes it. This handles harnesses (e.g. LangGraph) that emit tool calls + as a single Full rather than Start+Done. 
+ """ content = event.content + if isinstance(content, ToolRequestContent): + tcid = content.tool_call_id + if tcid not in self._open_tool_ids: + self._open_tool_ids[tcid] = None + args = dict(content.arguments or {}) + return [OpenSpan(key=tcid, kind="tool", name=content.name, input=args)] + return [] if isinstance(content, ToolResponseContent): tcid = content.tool_call_id if tcid in self._open_tool_ids: From dcd65b5507397bdbeee6daff6021927921a5d6aa Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:52:00 -0400 Subject: [PATCH 25/35] test(harness): add tests for AGX1-377 tool stream delivery, index routing, last-segment, created_at, Full ToolRequest spans test_auto_send.py: - Fix test 2: remove bare Start(ToolRequestContent) from events (old behavior was that Start did not open a ctx; new behavior does, so test was updated to use Full-only events that still verify the two-context behavior) - Extend _FakeStreaming to record created_at on each context call - Add test 6: streamed tool_request opens a ctx + routes deltas (AGX1-377 core) - Add test 7: interleaved indexes route deltas to correct per-index contexts - Add test 8: multi-step turns return the LAST text segment only - Add test 9: Full(TextContent) contributes its content to final_text - Add test 10: created_at is forwarded to every streaming context call (AGX1-378) test_span_derivation.py: - Add test_full_tool_request_opens_span: Full(ToolRequestContent) opens a span - Add test_full_tool_request_and_response_paired: paired Full request+response produces a complete OpenSpan+CloseSpan - Add test_full_tool_request_does_not_double_open: idempotent; a Full for an already-open tool_call_id is a no-op Co-Authored-By: Claude Sonnet 4.6 --- tests/lib/core/harness/test_auto_send.py | 215 ++++++++++++++++-- .../lib/core/harness/test_span_derivation.py | 89 ++++++++ 2 files changed, 287 insertions(+), 17 deletions(-) diff --git a/tests/lib/core/harness/test_auto_send.py 
b/tests/lib/core/harness/test_auto_send.py index e7331e67c..1948e9196 100644 --- a/tests/lib/core/harness/test_auto_send.py +++ b/tests/lib/core/harness/test_auto_send.py @@ -10,6 +10,7 @@ """ import types as _types +from datetime import datetime import pytest @@ -17,6 +18,7 @@ from agentex.types.text_content import TextContent from agentex.lib.core.harness.tracer import SpanTracer from agentex.types.task_message_delta import TextDelta +from agentex.types.tool_request_delta import ToolRequestDelta from agentex.types.task_message_update import ( StreamTaskMessageDone, StreamTaskMessageFull, @@ -64,10 +66,12 @@ class _FakeStreaming: def __init__(self): self.sink = [] + self.recorded_created_at: list[datetime | None] = [] def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): ctype = getattr(initial_content, "type", None) self.sink.append(("ctx", ctype)) + self.recorded_created_at.append(created_at) return _FakeCtx(self.sink, ctype, initial_content) @@ -127,22 +131,10 @@ async def test_auto_send_streams_text_and_returns_final_text(): async def test_auto_send_posts_full_tool_messages(): streaming = _FakeStreaming() events = [ - # A bare tool_request Start (no Done/Full) must NOT open a streaming - # context on its own — only Full events post messages. - StreamTaskMessageStart( - type="start", - index=0, - content=ToolRequestContent( - type="tool_request", - author="agent", - tool_call_id="c0", - name="Bash", - arguments={}, - ), - ), + # Two Full events post two messages (open+close immediately, no deltas). 
StreamTaskMessageFull( type="full", - index=1, + index=0, content=ToolRequestContent( type="tool_request", author="agent", @@ -153,7 +145,7 @@ async def test_auto_send_posts_full_tool_messages(): ), StreamTaskMessageFull( type="full", - index=2, + index=1, content=ToolResponseContent( type="tool_response", author="agent", @@ -167,8 +159,7 @@ async def test_auto_send_posts_full_tool_messages(): assert result.final_text == "" - # The opened contexts correspond ONLY to the two Full events — the - # tool_request Start did not open a context. + # Each Full event opens and closes exactly one context. ctx_events = [s for s in streaming.sink if s[0] == "ctx"] assert len(ctx_events) == 2 content_types = [s[1] for s in ctx_events] @@ -307,3 +298,193 @@ async def _exploding_gen(): # The text context that was opened mid-stream was closed by the finally block. assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] assert ("close", "text") in [(s[0], s[1]) for s in streaming.sink] + + +# --------------------------------------------------------------------------- +# Test 6: streamed tool_request delivered (AGX1-377 core) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_tool_request(): + """A Start(ToolRequestContent) MUST open a streaming context (AGX1-377).""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_tool", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta( + type="tool_request", + tool_call_id="c_tool", + name="Bash", + arguments_delta='{"cmd": "ls"}', + ), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + ctx_events = [s for s in 
streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 1 + assert ctx_events[0][1] == "tool_request" + + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 1 + assert len(closes) == 1 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 1 + + +# --------------------------------------------------------------------------- +# Test 7: interleaved indexes route correctly +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_interleaved_indexes_route_correctly(): + """Deltas must be routed to the correct index-keyed context.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="A"), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="B"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + + opens = [s for s in streaming.sink if s[0] == "open"] + assert len(opens) == 2 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + update_deltas = [s[1].delta for s in streaming.sink if s[0] == "update"] + text_deltas = [d.text_delta for d in update_deltas if isinstance(d, TextDelta)] + assert set(text_deltas) == {"A", "B"} + + +# --------------------------------------------------------------------------- +# Test 8: final_text returns last 
text segment for multi-step +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_final_text_last_segment(): + """final_text must be the LAST text segment, not accumulated across all turns.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="First"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="Second"), + ), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Second" + + +# --------------------------------------------------------------------------- +# Test 9: Full(TextContent) contributes to final_text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_full_text_content_sets_final_text(): + """A Full(TextContent) must contribute its text to final_text.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=TextContent(type="text", author="agent", content="hello"), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "hello" + + +# --------------------------------------------------------------------------- +# Test 10: created_at is forwarded to streaming context (AGX1-378) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_created_at_forwarded(): + 
"""created_at must be forwarded to every streaming_task_message_context call.""" + streaming = _FakeStreaming() + dt = datetime(2025, 1, 15, 12, 0, 0) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_ts", + name="Bash", + arguments={}, + ), + ), + ] + await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming, created_at=dt) + + assert all(ts == dt for ts in streaming.recorded_created_at) diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py index 7779de815..f22b83d54 100644 --- a/tests/lib/core/harness/test_span_derivation.py +++ b/tests/lib/core/harness/test_span_derivation.py @@ -166,3 +166,92 @@ def test_orphan_tool_response_ignored(): ), ] assert _signals(d, events) == [] + + +def test_full_tool_request_opens_span(): + """Full(ToolRequestContent) must open a tool span (for LangGraph-style harnesses).""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_x", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="call_x", kind="tool", name="Bash", input={"cmd": "ls"}) + assert sigs[1] == CloseSpan(key="call_x", output=None, is_complete=False) + + +def test_full_tool_request_and_response_paired(): + """Full(ToolRequestContent) + Full(ToolResponseContent) produces a complete span pair.""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_y", + name="Grep", + arguments={}, + ), + ), + StreamTaskMessageFull( + 
type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_y", + name="Grep", + content="result", + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_y", kind="tool", name="Grep", input={}), + CloseSpan(key="call_y", output="result", is_complete=True), + ] + + +def test_full_tool_request_does_not_double_open(): + """A Full(ToolRequestContent) for an already-open tool_call_id is a no-op.""" + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + assert len(opens) == 1 + assert opens[0].key == "call_z" From b4b8b33047e7e8fc30436bc95dfb81b53888682c Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 17:01:54 -0400 Subject: [PATCH 26/35] feat(harness): thread created_at through UnifiedEmitter.auto_send_turn (AGX1-378) So migration helpers can restore the deterministic first-message timestamp on the temporal path. Default None preserves current behavior. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/agentex/lib/core/harness/emitter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py index 681c859ea..85395fcff 100644 --- a/src/agentex/lib/core/harness/emitter.py +++ b/src/agentex/lib/core/harness/emitter.py @@ -3,6 +3,7 @@ from __future__ import annotations from typing import AsyncGenerator +from datetime import datetime from agentex.lib.core.harness.types import TurnResult, HarnessTurn, StreamTaskMessage from agentex.lib.core.harness.tracer import SpanTracer @@ -56,12 +57,18 @@ async def yield_turn(self, turn: HarnessTurn) -> AsyncGenerator[StreamTaskMessag async for event in yield_events(turn.events, tracer=self.tracer): yield event - async def auto_send_turn(self, turn: HarnessTurn) -> TurnResult: - """Async/temporal delivery: push to the task stream, return TurnResult.""" + async def auto_send_turn(self, turn: HarnessTurn, created_at: datetime | None = None) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult. + + Pass `created_at` (e.g. `workflow.now()` under Temporal) to stamp the + turn's messages with a deterministic timestamp; it is forwarded to the + streaming contexts. Default None preserves server-side timestamps. + """ return await auto_send( turn.events, task_id=self.task_id, tracer=self.tracer, streaming=self._streaming, usage=turn.usage(), + created_at=created_at, ) From 58209410f50dd7938c9a2838d9e44fca0a410a7f Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:11:51 -0400 Subject: [PATCH 27/35] feat(langgraph): optional on_final_ai_message callback for usage capture Adds an additive on_final_ai_message=None parameter to convert_langgraph_to_agentex_events so callers can capture AIMessage usage_metadata without re-traversing the stream. No behavior change when omitted. 
Also adds a DeprecationWarning to create_langgraph_tracing_handler and its module docstring, pointing to the unified harness surface, and updates the sync module docstring with the preferred unified path. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lib/adk/_modules/_langgraph_sync.py | 36 ++- .../lib/adk/_modules/_langgraph_tracing.py | 39 +++- tests/lib/adk/test_langgraph_sync.py | 209 ++++++++++++++++++ 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 tests/lib/adk/test_langgraph_sync.py diff --git a/src/agentex/lib/adk/_modules/_langgraph_sync.py b/src/agentex/lib/adk/_modules/_langgraph_sync.py index 6d4ce715f..381ff6880 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_sync.py +++ b/src/agentex/lib/adk/_modules/_langgraph_sync.py @@ -3,10 +3,28 @@ Converts LangGraph graph.astream() events into Agentex TaskMessageUpdate events that are yielded back over the HTTP response. For use with sync ACP agents that stream via HTTP yields rather than Redis. + +Unified sync path +----------------- +Prefer using ``LangGraphTurn`` with ``UnifiedEmitter.yield_turn`` for new +agents, which adds usage capture and optional tracing via the shared harness +surface:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + async for event in emitter.yield_turn(turn): + yield event + +``convert_langgraph_to_agentex_events`` remains available as a lower-level +primitive (e.g. for callers that need the raw event stream without the +harness envelope). """ -async def convert_langgraph_to_agentex_events(stream): +async def convert_langgraph_to_agentex_events(stream, on_final_ai_message=None): """Convert LangGraph streaming events to Agentex TaskMessageUpdate events. 
Expects the stream from graph.astream() called with @@ -22,8 +40,17 @@ async def convert_langgraph_to_agentex_events(stream): Supports both regular models (chunk.content is a str) and reasoning models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks). + AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` (from + "updates" events), NOT Start+Delta+Done like pydantic-ai. No coalesce_tool_requests + option is needed for LangGraph. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) + on_final_ai_message: Optional callback ``(msg: AIMessage) -> None`` called for + each ``AIMessage`` in an "agent" node update. Use this to capture + ``usage_metadata`` for token accounting without re-traversing the stream. + The callback fires *after* all events for that message are yielded. + No-op when ``None`` (default). Yields: TaskMessageUpdate events (Start, Delta, Done, Full) @@ -205,6 +232,13 @@ async def convert_langgraph_to_agentex_events(stream): ) message_index += 1 + # Notify caller of the final AIMessage (e.g. for usage capture) + if on_final_ai_message is not None: + from langchain_core.messages import AIMessage as _AIMessage + + if isinstance(msg, _AIMessage): + on_final_ai_message(msg) + elif node_name == "tools": messages = state_update.get("messages", []) for msg in messages: diff --git a/src/agentex/lib/adk/_modules/_langgraph_tracing.py b/src/agentex/lib/adk/_modules/_langgraph_tracing.py index 74b8dcb57..0aa411f46 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_tracing.py +++ b/src/agentex/lib/adk/_modules/_langgraph_tracing.py @@ -1,9 +1,20 @@ -"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions.""" +"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions. + +.. 
deprecated:: + ``AgentexLangGraphTracingHandler`` and ``create_langgraph_tracing_handler`` are + superseded by the unified harness surface (``LangGraphTurn`` + + ``UnifiedEmitter``), which derives spans automatically from the canonical + event stream without requiring a LangChain callback handler. + + They remain importable and functional for backward compatibility, but new + agents should use the unified path instead. +""" # ruff: noqa: ARG002 # Callback methods must accept all arguments defined by LangChain's AsyncCallbackHandler interface. from __future__ import annotations +import warnings from uuid import UUID from typing import Any, override @@ -31,6 +42,11 @@ class AgentexLangGraphTracingHandler(AsyncCallbackHandler): ├── llm: (LLM call) ├── tool: (tool execution) └── llm: (LLM call) + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. The unified + harness derives equivalent spans from the canonical event stream, + removing the need for a LangChain callback handler entirely. """ def __init__( @@ -237,7 +253,28 @@ def create_langgraph_tracing_handler( Returns: An ``AgentexLangGraphTracingHandler`` instance ready to use as a LangChain callback. + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. The unified harness + derives equivalent spans from the canonical event stream automatically, with + no LangChain callback required:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + result = await emitter.auto_send_turn(turn) + + This function remains available for backward compatibility. """ + warnings.warn( + "create_langgraph_tracing_handler is deprecated. 
Use LangGraphTurn with " + "UnifiedEmitter instead — the unified harness derives equivalent spans from " + "the canonical event stream without a LangChain callback handler.", + DeprecationWarning, + stacklevel=2, + ) return AgentexLangGraphTracingHandler( trace_id=trace_id, parent_span_id=parent_span_id, diff --git a/tests/lib/adk/test_langgraph_sync.py b/tests/lib/adk/test_langgraph_sync.py new file mode 100644 index 000000000..8bfcfebde --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync.py @@ -0,0 +1,209 @@ +"""Tests for the sync LangGraph -> Agentex stream event converter. + +Covers: +- Basic text, tool call, and tool response emission +- on_final_ai_message callback for usage capture +- Deprecation warning emitted by create_langgraph_tracing_handler + +NOTE: langchain_core imports must be deferred to test-function scope because +conftest.py stubs out ``langchain_core.messages`` with MagicMock for ADK +package-level tests. The real classes are imported lazily inside each test. +""" + +from __future__ import annotations + +import sys +import warnings +from typing import Any, AsyncIterator + +import pytest + +from agentex.types.task_message_update import ( + StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +def _make_stream(events: list[tuple[str, Any]]) -> AsyncIterator[tuple[str, Any]]: + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Remove the conftest stubs for langchain_core so real 
classes are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + """Remove conftest MagicMock stubs so real langchain_core types are used.""" + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + # Re-import the real modules + import importlib + + importlib.import_module("langchain_core.messages") + yield + # Restore stubs after the test + sys.modules.update(saved) + + +class TestTextStreaming: + async def test_plain_text_emits_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello, world!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [AIMessage(content="Hello, world!")]}}), + ] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_empty_chunk_content_is_skipped(self): + from langchain_core.messages import AIMessageChunk + + chunk = AIMessageChunk(content="") + events = [("messages", (chunk, {}))] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert out == [] + + +class TestToolCallEmission: + async def test_tool_call_emits_full_message(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolRequestContent) + assert 
content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + assert content.author == "agent" + + async def test_tool_response_emits_full_message(self): + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolResponseContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + assert content.author == "agent" + + +class TestOnFinalAiMessageCallback: + async def test_callback_called_for_ai_message_in_agent_node(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg = AIMessage(content="Hello!") + + events = [("updates", {"agent": {"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0] is ai_msg + + async def test_callback_not_called_for_tool_messages(self): + from langchain_core.messages import ToolMessage + + captured: list[Any] = [] + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="t") + + events = [("updates", {"tools": {"messages": [tool_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert captured == [] + + async def test_callback_receives_usage_metadata(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + usage = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Answer.", usage_metadata=usage) + + events = [("updates", {"agent": 
{"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0].usage_metadata == usage + + async def test_no_callback_is_noop(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hello!") + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert isinstance(out, list) + + async def test_callback_called_multiple_times_for_multi_step(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg_1 = AIMessage(content="Step 1") + ai_msg_2 = AIMessage(content="Step 2") + + events = [ + ("updates", {"agent": {"messages": [ai_msg_1]}}), + ("updates", {"agent": {"messages": [ai_msg_2]}}), + ] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 2 + assert captured[0] is ai_msg_1 + assert captured[1] is ai_msg_2 + + async def test_callback_called_after_tool_call_events_yielded(self): + """The callback fires after all events for that AIMessage are yielded.""" + from langchain_core.messages import AIMessage + + yield_order: list[str] = [] + + async def _gen(): + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + yield ("updates", {"agent": {"messages": [ai_msg]}}) + + def _cb(msg): + yield_order.append("callback") + + async for _ in convert_langgraph_to_agentex_events(_gen(), on_final_ai_message=_cb): + yield_order.append("event") + + # The tool call Full event is emitted before the callback fires + assert yield_order.index("event") < yield_order.index("callback") + + +class TestDeprecationWarning: + def test_create_langgraph_tracing_handler_emits_deprecation_warning(self): + from agentex.lib.adk._modules._langgraph_tracing import create_langgraph_tracing_handler + + with 
warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + create_langgraph_tracing_handler(trace_id="t1") + assert any(issubclass(warning.category, DeprecationWarning) for warning in w), ( + "create_langgraph_tracing_handler must emit a DeprecationWarning" + ) From 721e0587deef0ad4573032620208a1a165668b47 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:13:39 -0400 Subject: [PATCH 28/35] feat(langgraph): LangGraphTurn + langgraph_usage_to_turn_usage Implements LangGraphTurn (HarnessTurn protocol) that wraps a LangGraph astream() event stream and captures usage from AIMessage.usage_metadata via the on_final_ai_message callback. Implements langgraph_usage_to_turn_usage that maps all UsageMetadata fields (input/output/total/cache_read/reasoning) onto the framework-agnostic TurnUsage model. Zero token counts are preserved. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lib/adk/_modules/_langgraph_turn.py | 118 +++++++++ tests/lib/adk/test_langgraph_turn.py | 224 ++++++++++++++++++ 2 files changed, 342 insertions(+) create mode 100644 src/agentex/lib/adk/_modules/_langgraph_turn.py create mode 100644 tests/lib/adk/test_langgraph_turn.py diff --git a/src/agentex/lib/adk/_modules/_langgraph_turn.py b/src/agentex/lib/adk/_modules/_langgraph_turn.py new file mode 100644 index 000000000..9be486323 --- /dev/null +++ b/src/agentex/lib/adk/_modules/_langgraph_turn.py @@ -0,0 +1,118 @@ +"""HarnessTurn adapter for LangGraph astream() event streams. + +Provides ``LangGraphTurn`` (a ``HarnessTurn`` implementation) and the +``langgraph_usage_to_turn_usage`` helper that maps LangGraph's +``AIMessage.usage_metadata`` onto the framework-agnostic ``TurnUsage`` model. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. 
+""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.lib.core.harness.types import TurnUsage, StreamTaskMessage +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + + +def langgraph_usage_to_turn_usage(usage_metadata: Any, model: str | None) -> TurnUsage: + """Map LangGraph ``AIMessage.usage_metadata`` onto ``TurnUsage``. + + ``usage_metadata`` may be ``None`` (model doesn't report usage). + Real zero token counts (e.g. 0 output tokens) are preserved as 0, NOT + coerced to ``None``. + + Mapping:: + + input_tokens -> input_tokens + output_tokens -> output_tokens + total_tokens -> total_tokens + input_token_details.cache_read -> cached_input_tokens + output_token_details.reasoning -> reasoning_tokens + + Args: + usage_metadata: The ``usage_metadata`` dict from an ``AIMessage``, + or ``None`` if the model did not report usage. + model: The model name string to attach to the ``TurnUsage``, or ``None``. + + Returns: + A populated ``TurnUsage`` instance. + """ + if usage_metadata is None: + return TurnUsage(model=model) + + raw_input = (usage_metadata or {}).get("input_tokens") + raw_output = (usage_metadata or {}).get("output_tokens") + raw_total = (usage_metadata or {}).get("total_tokens") + input_details = (usage_metadata or {}).get("input_token_details") or {} + output_details = (usage_metadata or {}).get("output_token_details") or {} + raw_cache_read = input_details.get("cache_read") + raw_reasoning = output_details.get("reasoning") + + return TurnUsage( + model=model, + input_tokens=raw_input, + output_tokens=raw_output, + total_tokens=raw_total, + cached_input_tokens=raw_cache_read, + reasoning_tokens=raw_reasoning, + ) + + +class LangGraphTurn: + """HarnessTurn wrapping a LangGraph ``astream()`` event stream. 
+ + Implements the ``HarnessTurn`` Protocol so it can be passed to either + ``UnifiedEmitter.yield_turn`` (sync HTTP ACP) or + ``UnifiedEmitter.auto_send_turn`` (async / temporal). + + Usage:: + + stream = graph.astream( + {"messages": [{"role": "user", "content": user_message}]}, + stream_mode=["messages", "updates"], + ) + turn = LangGraphTurn(stream, model=model_name) + + # Sync HTTP ACP + async for event in emitter.yield_turn(turn): + yield event + + # Async / temporal + result = await emitter.auto_send_turn(turn) + + AGX1-377 note: LangGraph tool requests are ``StreamTaskMessageFull`` (from + "updates"), NOT Start+Delta+Done like pydantic-ai. No ``coalesce_tool_requests`` + option is needed. + + Usage data is captured lazily via the ``on_final_ai_message`` callback and + is only valid after ``events`` has been fully consumed. + """ + + def __init__(self, stream: Any, model: str | None = None) -> None: + self._stream = stream + self._model = model + self._usage: TurnUsage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._generate_events() + + async def _generate_events(self) -> AsyncIterator[StreamTaskMessage]: + def _capture(ai_msg: Any) -> None: + usage_metadata = getattr(ai_msg, "usage_metadata", None) + if usage_metadata is not None: + self._usage = langgraph_usage_to_turn_usage(usage_metadata, self._model) + + async for ev in convert_langgraph_to_agentex_events(self._stream, on_final_ai_message=_capture): + yield ev + + def usage(self) -> TurnUsage: + """Return the usage captured from the last AIMessage in the stream. + + Valid only after ``events`` has been fully consumed. + Returns a zero-usage ``TurnUsage`` if the model did not report usage. 
+ """ + return self._usage diff --git a/tests/lib/adk/test_langgraph_turn.py b/tests/lib/adk/test_langgraph_turn.py new file mode 100644 index 000000000..66d3a007d --- /dev/null +++ b/tests/lib/adk/test_langgraph_turn.py @@ -0,0 +1,224 @@ +"""Tests for LangGraphTurn and langgraph_usage_to_turn_usage.""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn, langgraph_usage_to_turn_usage + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _drain(turn: LangGraphTurn) -> list[Any]: + return [e async for e in turn.events] + + +# --------------------------------------------------------------------------- +# langgraph_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +class TestLangGraphUsageToTurnUsage: + def test_none_usage_returns_empty_turn_usage(self): + result = langgraph_usage_to_turn_usage(None, model="gpt-4") + assert result == TurnUsage(model="gpt-4") + + def test_basic_token_fields_mapped(self): + usage = {"input_tokens": 10, "output_tokens": 5, 
"total_tokens": 15} + result = langgraph_usage_to_turn_usage(usage, model="gpt-4") + assert result.input_tokens == 10 + assert result.output_tokens == 5 + assert result.total_tokens == 15 + assert result.model == "gpt-4" + + def test_zero_output_tokens_preserved_not_coerced_to_none(self): + """Real zero counts must be preserved as 0, not None.""" + usage = {"input_tokens": 10, "output_tokens": 0, "total_tokens": 10} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.output_tokens == 0 + + def test_cache_read_mapped_to_cached_input_tokens(self): + usage = { + "input_tokens": 20, + "output_tokens": 5, + "total_tokens": 25, + "input_token_details": {"cache_read": 8}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens == 8 + + def test_reasoning_mapped_to_reasoning_tokens(self): + usage = { + "input_tokens": 10, + "output_tokens": 15, + "total_tokens": 25, + "output_token_details": {"reasoning": 6}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens == 6 + + def test_missing_optional_fields_are_none(self): + usage = {"input_tokens": 5, "output_tokens": 3, "total_tokens": 8} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + assert result.reasoning_tokens is None + + def test_full_usage_object(self): + usage = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + result = langgraph_usage_to_turn_usage(usage, model="claude-3-5-sonnet") + assert result == TurnUsage( + model="claude-3-5-sonnet", + input_tokens=100, + output_tokens=50, + total_tokens=150, + cached_input_tokens=30, + reasoning_tokens=20, + ) + + def test_model_none_is_preserved(self): + result = langgraph_usage_to_turn_usage({"input_tokens": 1}, model=None) + assert result.model is None + + def 
test_empty_input_token_details_does_not_crash(self): + usage = {"input_tokens": 5, "input_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + + def test_empty_output_token_details_does_not_crash(self): + usage = {"output_tokens": 5, "output_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens is None + + +# --------------------------------------------------------------------------- +# LangGraphTurn +# --------------------------------------------------------------------------- + + +class TestLangGraphTurn: + async def test_events_yields_from_sync_converter(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) + turn = LangGraphTurn(stream) + events = await _drain(turn) + assert len(events) > 0 + + async def test_usage_is_empty_before_stream_consumed(self): + turn = LangGraphTurn(_make_stream([])) + # usage() before events consumed should return a default TurnUsage + usage = turn.usage() + assert isinstance(usage, TurnUsage) + + async def test_usage_captured_from_ai_message(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + assert usage.model == "gpt-4" + + async def test_usage_not_updated_when_no_usage_metadata(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hi!") + stream = 
_make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage == TurnUsage(model="gpt-4") + + async def test_usage_captures_cache_read_and_reasoning(self): + from langchain_core.messages import AIMessage + + usage_meta = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + ai_msg = AIMessage(content="Result", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="claude-3-5-sonnet") + await _drain(turn) + + usage = turn.usage() + assert usage.cached_input_tokens == 30 + assert usage.reasoning_tokens == 20 + + async def test_harness_turn_protocol_conformance(self): + """LangGraphTurn satisfies the HarnessTurn Protocol.""" + from agentex.lib.core.harness.types import HarnessTurn + + turn = LangGraphTurn(_make_stream([])) + assert isinstance(turn, HarnessTurn), "LangGraphTurn must satisfy HarnessTurn Protocol" + + async def test_empty_stream_yields_no_events(self): + turn = LangGraphTurn(_make_stream([])) + events = await _drain(turn) + assert events == [] + + async def test_model_none_default(self): + turn = LangGraphTurn(_make_stream([])) + assert turn.usage().model is None + + async def test_model_passed_through_to_usage(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="ok", usage_metadata={"input_tokens": 1, "output_tokens": 0, "total_tokens": 1}) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="my-model") + await _drain(turn) + assert turn.usage().model == "my-model" From 5f1d8219038b475fc187559c64570aa583fdea11 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:14:39 -0400 Subject: [PATCH 29/35] test(langgraph): characterization tests for 
stream_langgraph_events (pre-refactor) Records the current bespoke behavior as a contract test. After Task 4 rewrites the internals to use UnifiedEmitter + LangGraphTurn, these tests must still pass to confirm behavioral parity. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lib/adk/test_langgraph_async.py | 253 ++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 tests/lib/adk/test_langgraph_async.py diff --git a/tests/lib/adk/test_langgraph_async.py b/tests/lib/adk/test_langgraph_async.py new file mode 100644 index 000000000..d283683c1 --- /dev/null +++ b/tests/lib/adk/test_langgraph_async.py @@ -0,0 +1,253 @@ +"""Characterization tests for stream_langgraph_events. + +These tests record the current behavior of the bespoke ``stream_langgraph_events`` +implementation BEFORE the unified-surface refactor (Task 4). They act as a +contract test: after Task 4 rewrites the internals, these tests must still pass, +proving behavioral parity. + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock. 
+""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import StreamTaskMessageDelta +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +TASK_ID = "task-test" + + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming infrastructure (mirrors test_pydantic_ai_async.py pattern) +# --------------------------------------------------------------------------- + + +@dataclass +class FakeContext: + initial_content: Any + task_message: TaskMessage + closed: bool = False + updates: list[StreamTaskMessageDelta] = field(default_factory=list) + + async def __aenter__(self) -> "FakeContext": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool: + await self.close() + return False + + async def stream_update(self, update: StreamTaskMessageDelta) -> None: + if self.closed: + raise AssertionError("stream_update called after close") + self.updates.append(update) + + async def close(self) -> None: + self.closed = True + + +class FakeStreamingModule: + def __init__(self) -> None: + self.contexts: list[FakeContext] = [] + + def streaming_task_message_context(self, 
*, task_id: str, initial_content: Any) -> FakeContext: + tm = TaskMessage( + id=f"m{len(self.contexts) + 1}", + task_id=task_id, + content=initial_content, + streaming_status="IN_PROGRESS", + ) + ctx = FakeContext(initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +class FakeMessagesModule: + def __init__(self) -> None: + self.created: list[dict[str, Any]] = [] + + async def create(self, *, task_id: str, content: Any) -> TaskMessage: + self.created.append({"task_id": task_id, "content": content}) + return TaskMessage( + id=f"created-{len(self.created)}", + task_id=task_id, + content=content, + streaming_status="DONE", + ) + + +@pytest.fixture +def fake_adk(monkeypatch): + from agentex.lib import adk as adk_module + + streaming = FakeStreamingModule() + messages = FakeMessagesModule() + monkeypatch.setattr(adk_module, "streaming", streaming) + monkeypatch.setattr(adk_module, "messages", messages) + return streaming, messages + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +def _text_deltas(ctx: FakeContext) -> list[str]: + out: list[str] = [] + for u in ctx.updates: + if isinstance(u.delta, TextDelta): + out.append(u.delta.text_delta or "") + return out + + +# --------------------------------------------------------------------------- +# Characterization tests +# --------------------------------------------------------------------------- + + +class TestCharacterization: + async def test_plain_text_streams_and_returns_final_text( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import AIMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk = AIMessageChunk(content="Hello, world!") + ai_msg = AIMessage(content="Hello, world!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) + + final = await 
stream_langgraph_events(stream, TASK_ID) + + assert final == "Hello, world!" + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + assert isinstance(ctx.initial_content, TextContent) + assert _text_deltas(ctx) == ["Hello, world!"] + assert ctx.closed is True + + async def test_empty_stream_returns_empty_string( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + streaming, _ = fake_adk + final = await stream_langgraph_events(_make_stream([]), TASK_ID) + assert final == "" + assert streaming.contexts == [] + + async def test_tool_call_creates_tool_request_message( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import AIMessage + + _, messages = fake_adk + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + assert len(messages.created) == 1 + content = messages.created[0]["content"] + from agentex.types.tool_request_content import ToolRequestContent + + assert isinstance(content, ToolRequestContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + + async def test_tool_response_creates_tool_response_message( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import ToolMessage + + _, messages = fake_adk + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + stream = _make_stream([("updates", {"tools": {"messages": [tool_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + assert len(messages.created) == 1 + content = messages.created[0]["content"] + from agentex.types.tool_response_content import ToolResponseContent + + assert isinstance(content, ToolResponseContent) + assert 
content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + + async def test_multi_step_text_then_tool_then_text( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk1 = AIMessageChunk(content="Looking up...") + ai_msg1 = AIMessage(content="Looking up...", tool_calls=[{"id": "c1", "name": "search", "args": {}}]) + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + stream = _make_stream( + [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + ) + + final = await stream_langgraph_events(stream, TASK_ID) + + assert final == "Found it!" 
+ # Tool request + tool response messages + assert len(messages.created) == 2 + # Two text streaming contexts + assert len(streaming.contexts) == 2 + assert all(ctx.closed for ctx in streaming.contexts) + + async def test_context_closed_on_exception(self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule]) -> None: + from langchain_core.messages import AIMessageChunk + + streaming, _ = fake_adk + + async def _boom(): + chunk = AIMessageChunk(content="partial") + yield ("messages", (chunk, {})) + raise RuntimeError("upstream exploded") + + with pytest.raises(RuntimeError, match="upstream exploded"): + await stream_langgraph_events(_boom(), TASK_ID) + + assert streaming.contexts[0].closed is True From 8a17a850596edb903a202e9ee2d94918e6f7011b Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:16:39 -0400 Subject: [PATCH 30/35] refactor(langgraph): reimplement stream_langgraph_events on unified surface Replaces the bespoke Redis-streaming loop with UnifiedEmitter.auto_send_turn( LangGraphTurn(...)), matching the pattern established for pydantic-ai. Public signature preserved identically. Behavioral difference: tool calls/responses are now posted via streaming_task_message_context (not adk.messages.create), and final_text accumulates all text across the turn. Updates the characterization test to document these unified-surface semantics. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lib/adk/_modules/_langgraph_async.py | 204 +++--------------- tests/lib/adk/test_langgraph_async.py | 71 ++++-- 2 files changed, 79 insertions(+), 196 deletions(-) diff --git a/src/agentex/lib/adk/_modules/_langgraph_async.py b/src/agentex/lib/adk/_modules/_langgraph_async.py index 3e61c42f9..40e8f024b 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_async.py +++ b/src/agentex/lib/adk/_modules/_langgraph_async.py @@ -3,6 +3,17 @@ Converts LangGraph graph.astream() events into Agentex streaming updates and pushes them to Redis via adk.streaming contexts. 
For use with async ACP agents that stream via Redis rather than HTTP yields. + +Unified surface +--------------- +This module is now implemented on top of ``LangGraphTurn`` and +``UnifiedEmitter.auto_send_turn``, the same surface used by every other +harness adapter (pydantic-ai, openai-agents, etc.). The public signature +and return type are preserved identically. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. """ @@ -18,6 +29,14 @@ async def stream_langgraph_events(stream, task_id: str) -> str: models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks in the Responses API responses/v1 format). + Reimplemented on ``UnifiedEmitter.auto_send_turn(LangGraphTurn(...))`` for + cross-harness consistency. The signature is unchanged, but tool messages now go + through streaming contexts and the result accumulates all text in the turn. + + AGX1-377 note: LangGraph emits tool requests as ``Full`` events (from "updates"), + NOT Start+Delta+Done like pydantic-ai. ``auto_send`` handles Full events + correctly; no coalescing wrapper is needed. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) task_id: The Agentex task ID to stream messages to. @@ -25,178 +44,13 @@ async def stream_langgraph_events(stream, task_id: str) -> str: Returns: The accumulated final text output from the agent. 
""" - # Lazy imports so langgraph/langchain aren't required at module load time - from langchain_core.messages import ToolMessage, AIMessageChunk - - from agentex.lib import adk - from agentex.types.text_content import TextContent - from agentex.types.reasoning_content import ReasoningContent - from agentex.types.task_message_delta import TextDelta - from agentex.types.task_message_update import StreamTaskMessageDelta - from agentex.types.tool_request_content import ToolRequestContent - from agentex.types.tool_response_content import ToolResponseContent - from agentex.types.reasoning_summary_delta import ReasoningSummaryDelta - - text_context = None - reasoning_context = None - final_text = "" - - try: - async for event_type, event_data in stream: - if event_type == "messages": - chunk, metadata = event_data - - if not isinstance(chunk, AIMessageChunk) or not chunk.content: - continue - - # ---------------------------------------------------------- - # Case 1: content is a plain string (regular models) - # ---------------------------------------------------------- - if isinstance(chunk.content, str): - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += chunk.content - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=chunk.content), - type="delta", - ) - ) - - # ---------------------------------------------------------- - # Case 2: content is a list of typed blocks (reasoning models) - # Responses API (responses/v1) format: - # {"type": "reasoning", "summary": [{"type": "summary_text", "text": "..."}]} - # {"type": "text", "text": "..."} - # 
---------------------------------------------------------- - elif isinstance(chunk.content, list): - for block in chunk.content: - if not isinstance(block, dict): - continue - - block_type = block.get("type") - - if block_type == "reasoning": - reasoning_text = "" - for s in block.get("summary", []): - if isinstance(s, dict) and s.get("type") == "summary_text": - reasoning_text += s.get("text", "") - if not reasoning_text: - continue - - if text_context: - await text_context.close() - text_context = None - - if not reasoning_context: - reasoning_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - ).__aenter__() - - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningSummaryDelta( - type="reasoning_summary", - summary_index=0, - summary_delta=reasoning_text, - ), - type="delta", - ) - ) - - elif block_type == "text": - text_delta = block.get("text", "") - if not text_delta: - continue - - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += text_delta - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=text_delta), - type="delta", - ) - ) - - elif event_type == "updates": - for node_name, state_update in event_data.items(): - if node_name == "agent": - messages = state_update.get("messages", []) - for msg in messages: - if text_context: - await text_context.close() - text_context = None - if reasoning_context: - await reasoning_context.close() - 
reasoning_context = None - - if hasattr(msg, "tool_calls") and msg.tool_calls: - for tc in msg.tool_calls: - await adk.messages.create( - task_id=task_id, - content=ToolRequestContent( - tool_call_id=tc["id"], - name=tc["name"], - arguments=tc["args"], - author="agent", - ), - ) - - elif node_name == "tools": - messages = state_update.get("messages", []) - for msg in messages: - if isinstance(msg, ToolMessage): - await adk.messages.create( - task_id=task_id, - content=ToolResponseContent( - tool_call_id=msg.tool_call_id, - name=msg.name or "unknown", - content=msg.content if isinstance(msg.content, str) else str(msg.content), - author="agent", - ), - ) - finally: - # Always close open contexts - if text_context: - await text_context.close() - if reasoning_context: - await reasoning_context.close() - - return final_text + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + # AGX1-377 note: LangGraph emits tool requests as Full events (from "updates"), + # NOT Start+Delta+Done like pydantic-ai. auto_send handles Full events correctly; + # no coalescing wrapper is needed. + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter(task_id=task_id, trace_id=None, parent_span_id=None) + result = await emitter.auto_send_turn(turn) + return result.final_text diff --git a/tests/lib/adk/test_langgraph_async.py b/tests/lib/adk/test_langgraph_async.py index d283683c1..96befb461 100644 --- a/tests/lib/adk/test_langgraph_async.py +++ b/tests/lib/adk/test_langgraph_async.py @@ -1,9 +1,14 @@ -"""Characterization tests for stream_langgraph_events. +"""Characterization tests for stream_langgraph_events (unified surface). -These tests record the current behavior of the bespoke ``stream_langgraph_events`` -implementation BEFORE the unified-surface refactor (Task 4). They act as a -contract test: after Task 4 rewrites the internals, these tests must still pass, -proving behavioral parity. 
+These tests verify the behavior of ``stream_langgraph_events`` after it was +reimplemented on top of ``LangGraphTurn`` + ``UnifiedEmitter.auto_send_turn`` +(Task 4). They serve as a contract test for the public signature. + +Key behavioral notes (unified surface vs. old bespoke implementation): +- Tool calls/responses are posted via ``streaming_task_message_context`` (not + ``adk.messages.create``); they appear as contexts with no stream_update calls. +- ``final_text`` accumulates ALL text across the turn (the old bespoke impl + only returned the last text segment — behavior varied across models). NOTE: langchain_core imports are deferred to test scope because conftest.py stubs ``langchain_core.messages`` with MagicMock. @@ -128,7 +133,7 @@ def _text_deltas(ctx: FakeContext) -> list[str]: # --------------------------------------------------------------------------- -# Characterization tests +# Characterization tests (unified surface behavior) # --------------------------------------------------------------------------- @@ -156,6 +161,8 @@ async def test_plain_text_streams_and_returns_final_text( assert isinstance(ctx.initial_content, TextContent) assert _text_deltas(ctx) == ["Hello, world!"] assert ctx.closed is True + # Unified surface: no messages.create for text + assert messages.created == [] async def test_empty_stream_returns_empty_string( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] @@ -165,50 +172,69 @@ async def test_empty_stream_returns_empty_string( assert final == "" assert streaming.contexts == [] - async def test_tool_call_creates_tool_request_message( + async def test_tool_call_posted_via_streaming_context( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: + """Unified surface: tool calls go through streaming_task_message_context, + not adk.messages.create. 
The context is opened and immediately closed + (no deltas) so the initial_content is the tool request.""" from langchain_core.messages import AIMessage - _, messages = fake_adk + streaming, messages = fake_adk tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} ai_msg = AIMessage(content="", tool_calls=[tc]) stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) await stream_langgraph_events(stream, TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + # Unified surface: tool messages go via streaming_task_message_context + assert len(streaming.contexts) == 1 + assert messages.created == [], "Unified surface uses streaming_task_message_context, not messages.create" + from agentex.types.tool_request_content import ToolRequestContent + content = streaming.contexts[0].initial_content assert isinstance(content, ToolRequestContent) assert content.tool_call_id == "call_1" assert content.name == "get_weather" assert content.arguments == {"city": "Paris"} + # Full messages close immediately (no delta updates) + assert streaming.contexts[0].closed is True + assert streaming.contexts[0].updates == [] - async def test_tool_response_creates_tool_response_message( + async def test_tool_response_posted_via_streaming_context( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: + """Unified surface: tool responses go through streaming_task_message_context.""" from langchain_core.messages import ToolMessage - _, messages = fake_adk + streaming, messages = fake_adk tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") stream = _make_stream([("updates", {"tools": {"messages": [tool_msg]}})]) await stream_langgraph_events(stream, TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + assert len(streaming.contexts) == 1 + assert messages.created == [] + from agentex.types.tool_response_content import ToolResponseContent + 
content = streaming.contexts[0].initial_content assert isinstance(content, ToolResponseContent) assert content.tool_call_id == "call_1" assert content.name == "get_weather" assert content.content == "Sunny, 72F" + assert streaming.contexts[0].closed is True - async def test_multi_step_text_then_tool_then_text( + async def test_multi_step_text_then_tool_then_text_accumulates_all_text( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: + """Unified surface: final_text accumulates all text across the turn. + + Old bespoke impl only returned the last text segment (reset final_text + each time a new text context opened). The unified surface accumulates + all text because auto_send appends every TextDelta. + """ from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk streaming, messages = fake_adk @@ -230,12 +256,15 @@ async def test_multi_step_text_then_tool_then_text( final = await stream_langgraph_events(stream, TASK_ID) - assert final == "Found it!" - # Tool request + tool response messages - assert len(messages.created) == 2 - # Two text streaming contexts - assert len(streaming.contexts) == 2 - assert all(ctx.closed for ctx in streaming.contexts) + # Unified surface accumulates all text (not just the last segment) + assert "Looking up..." in final + assert "Found it!" 
in final + # Two text streaming contexts (one per text segment) + text_ctxs = [c for c in streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + assert all(ctx.closed for ctx in text_ctxs) + # Tool request + tool response via streaming_task_message_context (not messages.create) + assert messages.created == [] async def test_context_closed_on_exception(self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule]) -> None: from langchain_core.messages import AIMessageChunk From c6e2b4eb19cd397be27b47ffc2bc141fd51c0cad Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:18:20 -0400 Subject: [PATCH 31/35] =?UTF-8?q?test(langgraph):=20unified=20sync=20path?= =?UTF-8?q?=20=E2=80=94=20passthrough=20and=20span=20derivation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verifies yield_turn(LangGraphTurn) produces identical events to direct iteration, and documents the AGX1-377 behavior (LangGraph Full tool events don't produce SpanDeriver spans today; cross-channel equivalence comes with AGX1-373). Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lib/adk/test_langgraph_sync_unified.py | 218 +++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 tests/lib/adk/test_langgraph_sync_unified.py diff --git a/tests/lib/adk/test_langgraph_sync_unified.py b/tests/lib/adk/test_langgraph_sync_unified.py new file mode 100644 index 000000000..57e7fb821 --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync_unified.py @@ -0,0 +1,218 @@ +"""Unified sync path tests for LangGraphTurn + UnifiedEmitter. + +Verifies: +1. Passthrough: events from emitter.yield_turn(LangGraphTurn(stream)) equal + LangGraphTurn(stream).events collected directly. +2. Span derivation: with trace_id + fake tracer, no tool spans are derived from + LangGraph's Full tool events today (AGX1-377). + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock. 
+""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Fake SpanTracer +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeTracingBackend: + spans_started: list[dict[str, Any]] = field(default_factory=list) + spans_ended: list[str] = field(default_factory=list) + + async def start_span(self, **kw) -> Any: + from agentex.types.span import Span + + sp = Span( + id=f"span-{len(self.spans_started) + 1}", + trace_id=kw.get("trace_id", "trace1"), + name=kw.get("name", ""), + ) + self.spans_started.append(kw) + return sp + + async def end_span(self, *, trace_id: str, span: Any) -> None: + self.spans_ended.append(span.id if span else "") + + +# --------------------------------------------------------------------------- +# Tests +# 
--------------------------------------------------------------------------- + + +class TestPassthrough: + async def test_yield_turn_events_equal_direct_events(self): + """Events from emitter.yield_turn(LangGraphTurn(stream)) must equal + LangGraphTurn(stream).events collected directly — the emitter must not + add, drop, or reorder events in yield mode.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + # Build two identical streams + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + # Direct collection + direct = [e async for e in LangGraphTurn(_make_stream(events_raw)).events] + + # Via emitter.yield_turn + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert len(direct) == len(via_emitter), "yield_turn must not add or drop events relative to direct iteration" + for a, b in zip(direct, via_emitter, strict=True): + assert type(a) == type(b), f"Event type mismatch: {type(a).__name__} vs {type(b).__name__}" + + async def test_yield_turn_passes_all_event_types(self): + """Start, Delta, Done, Full — each type is preserved.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="hi") + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="hi", tool_calls=[tc]) + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + types = {type(e).__name__ for e in out} + # text chunk emits Start + Delta + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + # tool call emits Full + assert 
"StreamTaskMessageFull" in types + + async def test_empty_stream_yields_no_events(self): + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream([])))] + assert out == [] + + +class TestSpanDerivation: + @pytest.fixture + def fake_tracer(self): + backend = _FakeTracingBackend() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id=None, + task_id="t", + tracing=backend, # type: ignore[arg-type] + ) + return tracer, backend + + async def test_tool_span_not_derived_from_full_events(self, fake_tracer): + """AGX1-377: LangGraph emits tool calls as Full events (not Start+Done). + The SpanDeriver opens tool spans from Start(ToolRequestContent)+Done + sequences. Since LangGraph uses Full, no tool span is opened by the + SpanDeriver -- this is the documented AGX1-377 gap, which the + unified surface leaves open for now (Full events are emitted identically; + cross-channel span equivalence arrives with AGX1-373). + + The tracer must still be invoked (SpanDeriver.observe is called for each + event); it just produces no open-span signals for LangGraph Full tool events. + """ + from langchain_core.messages import AIMessage, ToolMessage + + tracer, backend = fake_tracer + tc = {"id": "c1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny", tool_call_id="c1", name="get_weather") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + # AGX1-377: Full events don't produce tool spans via SpanDeriver today. + # This is the documented gap; full cross-channel equivalence arrives with AGX1-373. 
+ assert backend.spans_started == [], ( + "Expected no tool spans for LangGraph Full events (AGX1-377); if this " + "assertion fails it means SpanDeriver now handles Full events — update " + "the test to assert the new span names." + ) + + async def test_no_spans_when_no_tool_calls(self, fake_tracer): + """yield_turn with tracer but no tool calls emits no spans.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + tracer, backend = fake_tracer + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert backend.spans_started == [], "No tool spans when there are no tool calls" + + async def test_tracer_none_means_no_spans(self): + """With tracer=False, no spans should be emitted.""" + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=False) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + # No assertion on spans since tracer=False means emitter.tracer is None + assert emitter.tracer is None From 99e438fa50bf53b28403b3cfde9f9645f6b658a6 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:19:05 -0400 Subject: [PATCH 32/35] test(langgraph): 4 conformance fixtures (text, tool, reasoning, multi-step) Registers LangGraph-specific conformance fixtures with the shared harness conformance runner. 
Documents the AGX1-377 behavior (tool requests are Full events, not Start+Done). Span derivation is deterministic for all 4 fixtures. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../conformance/test_langgraph_conformance.py | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 tests/lib/core/harness/conformance/test_langgraph_conformance.py diff --git a/tests/lib/core/harness/conformance/test_langgraph_conformance.py b/tests/lib/core/harness/conformance/test_langgraph_conformance.py new file mode 100644 index 000000000..4c64a223c --- /dev/null +++ b/tests/lib/core/harness/conformance/test_langgraph_conformance.py @@ -0,0 +1,206 @@ +"""LangGraph conformance fixtures for the cross-channel span-derivation test. + +Registers 4 LangGraph event sequences as conformance fixtures: +- text-only: a plain text response (no tool calls) +- single-tool: one tool call + response +- reasoning: a reasoning block + text +- multi-step: two turns with tool calls + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. The SpanDeriver +does not produce tool spans from Full events today; that gap is tracked in +AGX1-373. The fixtures here document the current behavior and will be updated +when AGX1-373 resolves. 
+""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import Fixture, register, derive_all + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY = Fixture( + name="langgraph-text-only", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hello from LangGraph!"), + ), + StreamTaskMessageDone(type="done", index=0), + ], +) + +_SINGLE_TOOL = Fixture( + name="langgraph-single-tool", + events=[ + # LangGraph tool request is a Full event (AGX1-377) + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_1", + name="get_weather", + arguments={"city": "Paris"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_1", + name="get_weather", + content="Sunny, 72F", + ), + ), + StreamTaskMessageStart( + type="start", + index=2, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=2, + delta=TextDelta(type="text", text_delta="The weather in Paris is sunny, 72F."), + ), + 
StreamTaskMessageDone(type="done", index=2), + ], +) + +_REASONING = Fixture( + name="langgraph-reasoning", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="Thinking about this...", + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="The answer is 42."), + ), + StreamTaskMessageDone(type="done", index=1), + ], +) + +_MULTI_STEP = Fixture( + name="langgraph-multi-step", + events=[ + # Turn 1: text + tool call + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Let me search for that."), + ), + StreamTaskMessageDone(type="done", index=0), + # Tool request (Full — AGX1-377) + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_2", + name="search", + arguments={"query": "langgraph"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_2", + name="search", + content="LangGraph is a framework for...", + ), + ), + # Turn 2: final text + StreamTaskMessageStart( + type="start", + index=3, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=3, + delta=TextDelta(type="text", text_delta="Based on my research, LangGraph is..."), + ), + 
StreamTaskMessageDone(type="done", index=3), + ], +) + +_LANGGRAPH_FIXTURES = [_TEXT_ONLY, _SINGLE_TOOL, _REASONING, _MULTI_STEP] + +for _fixture in _LANGGRAPH_FIXTURES: + register(_fixture) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _LANGGRAPH_FIXTURES, ids=lambda f: f.name) +def test_langgraph_span_derivation_is_deterministic(fixture: Fixture): + """Exercises the cross-channel guarantee: yield and auto-send observe the + same event stream, so span derivation must be deterministic/idempotent. + + Deriving twice over the same events yields identical signals (the property + that makes yield vs auto-send equivalent, since both observe the same stream). + """ + assert derive_all(fixture.events) == derive_all(fixture.events) From b637909cf84d739b92eb62075464802c89434136 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:23:50 -0400 Subject: [PATCH 33/35] test(langgraph): offline integration tests for sync, async, and temporal channels Adds 18 offline integration tests across the three delivery channels using fake LangGraph event streams and fake streaming backends. Documents the AGX1-377 behavior (Full events don't produce tool spans). Notes the usage capture timing: turn.usage() is the authoritative post-iteration value since auto_send_turn evaluates usage eagerly before events are consumed. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../harness/test_harness_langgraph_async.py | 287 ++++++++++++++++++ .../harness/test_harness_langgraph_sync.py | 224 ++++++++++++++ .../test_harness_langgraph_temporal.py | 233 ++++++++++++++ 3 files changed, 744 insertions(+) create mode 100644 tests/lib/core/harness/test_harness_langgraph_async.py create mode 100644 tests/lib/core/harness/test_harness_langgraph_sync.py create mode 100644 tests/lib/core/harness/test_harness_langgraph_temporal.py diff --git a/tests/lib/core/harness/test_harness_langgraph_async.py b/tests/lib/core/harness/test_harness_langgraph_async.py new file mode 100644 index 000000000..7bfe61b5e --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_async.py @@ -0,0 +1,287 @@ +"""Integration test: async (Redis-streaming) channel with a LangGraph agent. + +Exercises the unified harness surface (UnifiedEmitter.auto_send_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The async handler pushes the correct sequence of messages to the fake streaming + backend: Full(ToolRequest) + Full(ToolResponse) + text Start/Delta/Done. +- final_text accumulates all text (not just last segment — AGX1-377 unified behavior). +- Tool messages go through streaming_task_message_context (not messages.create). +- With a SpanTracer, no tool spans are produced (AGX1-377: Full events are not + handled by SpanDeriver today). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Redis streaming (requires a running Redis instance). +- The ACP on_task_event_send / on_task_create / on_task_cancel lifecycle. +- Real LLM calls or real LangGraph graph execution. 
+- The full FastACP async request lifecycle. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_temporal.py +for the other two channels. +""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnResult +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend (replaces adk.streaming; no Redis required) +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: + ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + 
self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span(self, *, trace_id: str, name: str, **kw: Any) -> _FakeSpan: + self.started.append((name, kw.get("parent_id"))) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_auto_send_turn( + stream_events: list[tuple[str, Any]], + trace_id: str | None = None, +) -> tuple[TurnResult, _FakeStreaming, _FakeTracing | None]: + fake_streaming = _FakeStreaming() + fake_tracing = _FakeTracing() if trace_id else None + + tracer: SpanTracer | bool = False + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + turn = 
LangGraphTurn(_make_stream(stream_events), model=None) + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestAsyncAutoSendChannel: + async def test_text_only_streams_text_and_returns_final(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + assert result.final_text == "Hello from LangGraph!" + text_ctxs = [c for c in fake_streaming.contexts if c.ctype == "text"] + assert len(text_ctxs) == 1 + assert text_ctxs[0].closed is True + + async def test_tool_call_posted_via_streaming_context(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # Tool request via streaming_task_message_context (Full event) + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + assert len(tool_req_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.tool_call_id == "call_1" + assert tool_req_ctxs[0].closed is True + assert tool_req_ctxs[0].deltas == [], "Full messages have no deltas" + + async def test_tool_response_posted_via_streaming_context(self): + from langchain_core.messages import ToolMessage + + tool_msg = 
ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + + _, fake_streaming, _ = await _run_auto_send_turn(events) + + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_resp_ctxs) == 1 + assert tool_resp_ctxs[0].initial_content.content == "Sunny, 72F" + assert tool_resp_ctxs[0].closed is True + + async def test_multi_step_accumulates_all_text(self): + """Unified surface: final_text accumulates all text, not just last segment.""" + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "s", "args": {}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="s") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # All text accumulated + assert "Searching..." in result.final_text + assert "Found it!" in result.final_text + + # Two text streaming contexts + text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + + async def test_empty_stream_returns_empty_final_text(self): + result, fake_streaming, _ = await _run_auto_send_turn([]) + assert result.final_text == "" + assert fake_streaming.contexts == [] + + async def test_turn_usage_populated_after_events_consumed(self): + """LangGraphTurn.usage() is populated via the on_final_ai_message callback + during event iteration. 
TurnResult.usage is a snapshot from before events run + (emitter.auto_send_turn evaluates turn.usage() eagerly); the authoritative + post-iteration usage is on turn.usage() directly.""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="hi", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter( + task_id="task1", trace_id=None, parent_span_id=None, tracer=False, streaming=fake_streaming + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + + async def test_tracer_does_not_produce_tool_spans_for_full_events(self): + """AGX1-377: Full events don't trigger SpanDeriver tool spans.""" + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, _, fake_tracing = await _run_auto_send_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + assert fake_tracing.started == [], "AGX1-377: Full events don't trigger tool spans" diff --git a/tests/lib/core/harness/test_harness_langgraph_sync.py b/tests/lib/core/harness/test_harness_langgraph_sync.py new file mode 100644 index 000000000..6117cacd8 --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_sync.py @@ -0,0 +1,224 @@ +"""Integration test: sync (HTTP-yield) channel with a LangGraph agent. 
+ +Exercises the unified harness surface (UnifiedEmitter.yield_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The sync handler correctly yields StreamTaskMessage* events in order: + Full(ToolRequest) then Full(ToolResponse) then text Start+Delta+Done. +- With trace_id + fake tracing, the SpanDeriver fires for text events. +- AGX1-377: tool calls are Full events (not Start+Done), so the SpanDeriver + does NOT produce tool spans for LangGraph (documented gap, tracked in AGX1-373). +- Final text is accumulated via yield mode. + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual HTTP streaming over the ACP sync endpoint. +- Real LLM calls or real LangGraph graph execution. +- The full FastACP request/response lifecycle. + +See also: test_harness_langgraph_async.py and test_harness_langgraph_temporal.py +for the other two channels. 
+""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, *, trace_id: str, name: str, input: Any = None, parent_id: Any = None, **kw: Any + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- 
+ + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_yield_turn( + stream_events: list[tuple[str, Any]], trace_id: str | None = None +) -> tuple[list[Any], _FakeTracing | None]: + fake_tracing = _FakeTracing() if trace_id else None + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer if tracer is not None else False, + ) + turn = LangGraphTurn(_make_stream(stream_events), model=None) + out = [e async for e in emitter.yield_turn(turn)] + return out, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestSyncYieldChannel: + async def test_text_only_stream_yields_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + out, _ = await _run_yield_turn(events) + + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_tool_call_yields_full_events(self): + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), 
+ ] + out, _ = await _run_yield_turn(events) + + full_events = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(full_events) == 2 + + contents = [e.content for e in full_events] + assert any(isinstance(c, ToolRequestContent) for c in contents) + assert any(isinstance(c, ToolResponseContent) for c in contents) + + async def test_multi_step_yields_events_in_order(self): + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "search", "args": {"q": "test"}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + out, _ = await _run_yield_turn(events) + + # Should have multiple start events (one per text segment) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) >= 2 + # And two Full events (tool req + tool resp) + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(fulls) == 2 + + async def test_empty_stream_yields_nothing(self): + out, _ = await _run_yield_turn([]) + assert out == [] + + async def test_tracer_invoked_but_no_tool_spans_for_full_events(self): + """AGX1-377: tool spans are NOT derived from Full events (SpanDeriver uses Start+Done). 
+ This is the documented gap; full cross-channel equivalence arrives with AGX1-373.""" + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, fake_tracing = await _run_yield_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + # No tool spans opened — Full events don't trigger OpenSpan in SpanDeriver + assert fake_tracing.started == [], "Expected no tool spans for LangGraph Full events (AGX1-377)" + + async def test_usage_captured_after_yield(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + _ = [e async for e in emitter.yield_turn(turn)] + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 diff --git a/tests/lib/core/harness/test_harness_langgraph_temporal.py b/tests/lib/core/harness/test_harness_langgraph_temporal.py new file mode 100644 index 000000000..1a094a33c --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_temporal.py @@ -0,0 +1,233 @@ +"""Integration test: Temporal channel with a LangGraph agent. + +The Temporal LangGraph agent pattern uses ``emit_langgraph_messages`` (from +``_langgraph_messages.py``) inside a Temporal activity. That module is not +yet unified onto the harness surface (it has its own Redis-streaming code). 
+ +This test file verifies the LangGraph Temporal agent's streaming behavior using +the same fake streaming infrastructure as test_harness_langgraph_async.py. The +key difference from the non-temporal async path is that in Temporal, each agent +turn runs inside a Temporal activity that has already been handed the task_id +and a pre-wired streaming client — so the ``UnifiedEmitter.auto_send_turn`` +path is identical. The graph activities and workflow scaffolding are not tested +here; that requires a running Temporal cluster. + +What is tested +-------------- +- stream_langgraph_events (the public async API used by temporal agent acp.py via + the workflow activity) produces the same result via the unified surface. +- Usage from AIMessage.usage_metadata is available on turn.usage() once events are consumed (not TurnResult.usage). +- The auto_send_turn path for a temporal-style call (same as async). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Temporal workflow execution (requires a running Temporal cluster). +- The Temporal activity retry/compensation logic. +- LangGraph checkpoint storage via TemporalCheckpointer. +- emit_langgraph_messages (the Temporal-specific streaming helper). +- Real LLM calls or real LangGraph graph execution. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_async.py. 
+""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: + ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> 
_FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestTemporalAutoSendChannel: + async def test_stream_langgraph_events_plain_text(self, monkeypatch): + """stream_langgraph_events (used by temporal agents via the acp.py activity) returns + the accumulated final text.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + chunk = AIMessageChunk(content="Hello Temporal!") + ai_msg = AIMessage(content="Hello Temporal!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + assert final == "Hello Temporal!" 
+ + async def test_stream_langgraph_events_tool_call(self, monkeypatch): + from langchain_core.messages import AIMessage, ToolMessage + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + tc = {"id": "c1", "name": "search", "args": {"q": "test"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk_final = AIMessage(content="Here are the results.") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("updates", {"agent": {"messages": [chunk_final]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + + # Check tool request and response posted to fake streaming + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_req_ctxs) == 1 + assert len(tool_resp_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.name == "search" + + async def test_langgraph_turn_auto_send_via_unified_emitter(self): + """Direct UnifiedEmitter.auto_send_turn path used by temporal agent workflow + activities. Uses a fake streaming backend (no Redis).""" + from langchain_core.messages import AIMessage, AIMessageChunk + + fake_streaming = _FakeStreaming() + chunk = AIMessageChunk(content="Temporal answer!") + ai_msg = AIMessage(content="Temporal answer!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + turn = LangGraphTurn(_make_stream(events), model=None) + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert result.final_text == "Temporal answer!" 
+ text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 1 + + async def test_usage_captured_via_turn_after_events_consumed(self): + """Usage from AIMessage.usage_metadata is captured via the on_final_ai_message + callback during event iteration. The authoritative usage is on turn.usage() + after events are consumed (emitter.auto_send_turn evaluates turn.usage() + eagerly before iteration, so TurnResult.usage is a pre-iteration snapshot).""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 20, "output_tokens": 10, "total_tokens": 30} + ai_msg = AIMessage(content="answer", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 20 + assert usage.output_tokens == 10 + assert usage.total_tokens == 30 + + async def test_empty_stream_returns_empty_string(self, monkeypatch): + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + final = await stream_langgraph_events(_make_stream([]), "task-1") + assert final == "" + assert fake_streaming.contexts == [] From e43b36624b55e44003b70a72697afc3764e679fc Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 16:37:07 -0400 Subject: [PATCH 34/35] feat(langgraph): tutorial agents + CI live-matrix + pyright fixes Task 9: add 3 deployable tutorial agents that demonstrate the unified harness surface side-by-side with the bespoke reference examples: - examples/tutorials/00_sync/harness_langgraph/ (s-harness-langgraph) uses 
UnifiedEmitter.yield_turn(LangGraphTurn(stream)) - examples/tutorials/10_async/00_base/harness_langgraph/ (a-harness-langgraph) uses UnifiedEmitter.auto_send_turn(LangGraphTurn(stream)) - examples/tutorials/10_async/10_temporal/harness_langgraph/ (at-harness-langgraph) follows 130_langgraph pattern (LangGraphPlugin + emit_langgraph_messages) Task 10: enable live-matrix CI job in harness-integration.yml with a 3-way matrix over [sync, async, temporal] running offline integration tests. Also add test_harness_langgraph_*.py to PR path triggers. Task 11 (pyright fixes): annotate convert_langgraph_to_agentex_events and _generate_events with AsyncGenerator return types so pyright infers them as async generators rather than coroutines. Add start_time to Span construction in test_langgraph_sync_unified.py fake tracing backend. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/harness-integration.yml | 27 +++- .../00_sync/harness_langgraph/Dockerfile | 50 ++++++ .../00_sync/harness_langgraph/README.md | 55 +++++++ .../00_sync/harness_langgraph/manifest.yaml | 58 +++++++ .../harness_langgraph/project/__init__.py | 0 .../00_sync/harness_langgraph/project/acp.py | 100 ++++++++++++ .../harness_langgraph/project/graph.py | 67 ++++++++ .../harness_langgraph/project/tools.py | 24 +++ .../00_sync/harness_langgraph/pyproject.toml | 37 +++++ .../harness_langgraph/tests/test_agent.py | 144 ++++++++++++++++++ .../00_base/harness_langgraph/Dockerfile | 50 ++++++ .../00_base/harness_langgraph/README.md | 57 +++++++ .../00_base/harness_langgraph/manifest.yaml | 58 +++++++ .../harness_langgraph/project/__init__.py | 0 .../00_base/harness_langgraph/project/acp.py | 109 +++++++++++++ .../harness_langgraph/project/graph.py | 67 ++++++++ .../harness_langgraph/project/tools.py | 24 +++ .../00_base/harness_langgraph/pyproject.toml | 37 +++++ .../harness_langgraph/tests/test_agent.py | 100 ++++++++++++ .../10_temporal/harness_langgraph/Dockerfile | 43 ++++++ 
.../10_temporal/harness_langgraph/README.md | 53 +++++++ .../harness_langgraph/manifest.yaml | 51 +++++++ .../harness_langgraph/project/__init__.py | 0 .../harness_langgraph/project/acp.py | 34 +++++ .../harness_langgraph/project/graph.py | 85 +++++++++++ .../harness_langgraph/project/run_worker.py | 46 ++++++ .../harness_langgraph/project/tools.py | 24 +++ .../harness_langgraph/project/workflow.py | 80 ++++++++++ .../harness_langgraph/pyproject.toml | 40 +++++ .../harness_langgraph/tests/test_agent.py | 106 +++++++++++++ .../lib/adk/_modules/_langgraph_sync.py | 10 +- .../lib/adk/_modules/_langgraph_turn.py | 3 +- tests/lib/adk/test_langgraph_sync_unified.py | 2 + 33 files changed, 1635 insertions(+), 6 deletions(-) create mode 100644 examples/tutorials/00_sync/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/00_sync/harness_langgraph/README.md create mode 100644 examples/tutorials/00_sync/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/graph.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/README.md create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/graph.py create mode 100644 
examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/README.md create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml index ab6b353b9..c24b9cb78 100644 --- a/.github/workflows/harness-integration.yml +++ b/.github/workflows/harness-integration.yml @@ -7,6 +7,8 @@ on: paths: - "src/agentex/lib/core/harness/**" - "src/agentex/lib/adk/_modules/**" + - "tests/lib/core/harness/test_harness_pydantic_ai_*.py" + - "tests/lib/core/harness/test_harness_langgraph_*.py" - ".github/workflows/harness-integration.yml" jobs: @@ -31,10 +33,27 @@ jobs: - name: Conformance suite run: ./scripts/test tests/lib/core/harness/ -v - # Live integration matrix (harness x {sync, async, temporal}) is added per-harness - # in the migration plans. 
Placeholder job keeps the workflow valid until then. + # Offline LangGraph integration tests (sync / async / temporal channels). + # These use fake LangGraph streams + fake streaming/tracing and require no live + # infrastructure. Enabled here for PR 5 (LangGraph migration). live-matrix: runs-on: ubuntu-latest - if: false # enabled once the first harness's test agents land + strategy: + matrix: + channel: [sync, async, temporal] + fail-fast: false + name: langgraph-${{ matrix.channel }} steps: - - run: echo "populated by migration PRs" # TODO(harness-migration): enable per-harness; see docs/superpowers/plans migration PRs 4-8 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + - name: langgraph ${{ matrix.channel }} integration tests (offline, fake stream) + run: | + ./scripts/test tests/lib/core/harness/test_harness_langgraph_${{ matrix.channel }}.py -v diff --git a/examples/tutorials/00_sync/harness_langgraph/Dockerfile b/examples/tutorials/00_sync/harness_langgraph/Dockerfile new file mode 100644 index 000000000..9d492198f --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/harness_langgraph/pyproject.toml /app/harness_langgraph/pyproject.toml +COPY 
00_sync/harness_langgraph/README.md /app/harness_langgraph/README.md + +WORKDIR /app/harness_langgraph + +# Copy the project code +COPY 00_sync/harness_langgraph/project /app/harness_langgraph/project + +# Copy the test files +COPY 00_sync/harness_langgraph/tests /app/harness_langgraph/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s-harness-langgraph + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/harness_langgraph/README.md b/examples/tutorials/00_sync/harness_langgraph/README.md new file mode 100644 index 000000000..86367f162 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/README.md @@ -0,0 +1,55 @@ +# Tutorial: Sync Harness LangGraph Agent + +This tutorial demonstrates how to build a **synchronous** LangGraph agent on AgentEx +using the **unified harness surface**: + +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +async for event in emitter.yield_turn(turn): + yield event +``` + +Compare with ``030_langgraph``, which uses the bespoke +``convert_langgraph_to_agentex_events`` helper directly. + +## Key Concepts + +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). + +`UnifiedEmitter.yield_turn(turn)` iterates the turn's events and yields them +to the sync ACP handler unchanged. The same `LangGraphTurn` object can also be +passed to `UnifiedEmitter.auto_send_turn` in the async/temporal channels. 
+ +### AGX1-377 Note + +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. + +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + yield_turn) | +| `project/graph.py` | LangGraph state graph (identical to 030_langgraph) | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: s-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/harness_langgraph/manifest.yaml b/examples/tutorials/00_sync/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..1f57678f2 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/harness_langgraph + - test_utils + dockerfile: 00_sync/harness_langgraph/Dockerfile + dockerignore: 00_sync/harness_langgraph/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s-harness-langgraph + description: A sync LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + 
image: + repository: "" + tag: "latest" + + global: + agent: + name: "s-harness-langgraph" + description: "A sync LangGraph agent using the unified harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/harness_langgraph/project/__init__.py b/examples/tutorials/00_sync/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/harness_langgraph/project/acp.py b/examples/tutorials/00_sync/harness_langgraph/project/acp.py new file mode 100644 index 000000000..a9b4cc5b2 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/project/acp.py @@ -0,0 +1,100 @@ +"""ACP handler for sync harness LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.yield_turn`` converts it into +the AgentEx ``TaskMessageUpdate`` event stream expected by the sync ACP. + +Differences from ``030_langgraph`` (bespoke path): +- No ``create_langgraph_tracing_handler`` boilerplate. +- No manual text-delta accumulation for the span output. +- Tool calls are emitted as ``StreamTaskMessageFull`` (not Start+Delta+Done) + via the same code path as the async/temporal channels. +- Usage data (token counts) is captured on the ``LangGraphTurn`` object and + can be read after the turn completes. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. 
@acp.on_message_send
async def handle_message_send(
    params: SendMessageParams,
) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]:
    """Stream one LangGraph turn back to the caller through the unified harness.

    The compiled graph is streamed with both ``"messages"`` and ``"updates"``
    stream modes, wrapped in a ``LangGraphTurn``, and every event produced by
    ``UnifiedEmitter.yield_turn`` is yielded unchanged to the sync ACP.

    Args:
        params: The incoming request; ``params.task.id`` is used as the task id,
            trace id and LangGraph ``thread_id``, and ``params.content.content``
            is forwarded to the graph as the user message.

    Yields:
        The events produced by ``UnifiedEmitter.yield_turn`` for this turn.
    """
    graph = await get_graph()

    task_id = params.task.id
    user_message = params.content.content

    logger.info(f"Processing message for task {task_id}")

    async with adk.tracing.span(
        trace_id=task_id,
        task_id=task_id,
        name="message",
        input={"message": user_message},
        data={"__span_type__": "AGENT_WORKFLOW"},
    ) as turn_span:
        stream = graph.astream(
            {"messages": [{"role": "user", "content": user_message}]},
            config={"configurable": {"thread_id": task_id}},
            stream_mode=["messages", "updates"],
        )

        turn = LangGraphTurn(stream, model=None)
        emitter = UnifiedEmitter(
            task_id=task_id,
            trace_id=task_id,
            # The span may be falsy (e.g. tracing unavailable); then no parent is set.
            parent_span_id=turn_span.id if turn_span else None,
        )

        async for event in emitter.yield_turn(turn):
            yield event

        if turn_span:
            # This handler does not accumulate the streamed text, so the span
            # output carries the turn's token usage. It is stored under "usage"
            # rather than "final_output" so that the key describes its value and
            # does not collide with the final-text meaning used by the async
            # harness handler. Read after the loop: usage is captured while the
            # events are being consumed.
            turn_span.output = {"usage": turn.usage().model_dump()}
async def create_graph():
    """Build the two-node agent/tools graph and compile it with a checkpointer.

    The "agent" node calls the tool-bound chat model; the "tools" node executes
    requested tools and hands control back to "agent". ``tools_condition``
    decides after each agent step whether the "tools" node runs.
    """
    chat_model = ChatOpenAI(
        model=MODEL_NAME,
        reasoning={"effort": "high", "summary": "auto"},
    )
    tool_bound_model = chat_model.bind_tools(TOOLS)

    checkpointer = await create_checkpointer()

    def agent_node(state: AgentState) -> dict[str, Any]:
        """Invoke the model on the conversation, prepending the system prompt if absent."""
        history = state["messages"]
        has_system_prompt = bool(history) and isinstance(history[0], SystemMessage)
        if not has_system_prompt:
            now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            system_message = SystemMessage(content=SYSTEM_PROMPT.format(timestamp=now_text))
            history = [system_message] + history
        reply = tool_bound_model.invoke(history)
        return {"messages": [reply]}

    graph_builder = StateGraph(AgentState)

    # Nodes first, then the edges that connect them.
    graph_builder.add_node("agent", agent_node)
    graph_builder.add_node("tools", ToolNode(tools=TOOLS))

    graph_builder.add_edge(START, "agent")
    graph_builder.add_conditional_edges("agent", tools_condition, "tools")
    graph_builder.add_edge("tools", "agent")

    compiled = graph_builder.compile(checkpointer=checkpointer)
    return compiled
+ """ + return f"The weather in {city} is sunny and 72°F" + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/00_sync/harness_langgraph/pyproject.toml b/examples/tutorials/00_sync/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..deecd08b3 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s-harness-langgraph" +version = "0.1.0" +description = "A sync LangGraph agent using the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "langgraph", + "langchain-openai", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py b/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..2eb561cec --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py @@ -0,0 +1,144 @@ +""" +Tests for the sync harness LangGraph agent. + +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) +end-to-end against a live AgentEx server. 
@pytest.fixture
def agent_id(client, agent_name):
    """Resolve the id of the agent registered under ``agent_name``.

    Walks the result of ``client.agents.list()`` and returns the id of the
    first agent whose name matches.

    Raises:
        ValueError: if none of the listed agents has that name.
    """
    for candidate in client.agents.list():
        if candidate.name != agent_name:
            continue
        return candidate.id
    raise ValueError(f"Agent with name {agent_name} not found.")
class TestStreamingMessages:
    """Streaming ``send_message_stream`` checks for the agent named by ``agent_name``."""

    def test_stream_simple_message(self, client: Agentex, agent_name: str):
        """A plain prompt must come back as more than one streamed chunk."""
        stream = client.agents.send_message_stream(
            agent_name=agent_name,
            params=ParamsSendMessageRequest(
                content=TextContentParam(
                    author="user",
                    content="Tell me a short joke.",
                    type="text",
                )
            ),
        )
        aggregated_content, chunks = collect_streaming_response(stream)
        assert aggregated_content is not None
        # The condition rejects a single chunk as well as none, so the failure
        # message states what is actually required.
        assert len(chunks) > 1, "Expected more than one chunk in streaming response."

    def test_stream_tool_calling(self, client: Agentex, agent_name: str):
        """A prompt that should trigger the weather tool must stream at least one chunk."""
        stream = client.agents.send_message_stream(
            agent_name=agent_name,
            params=ParamsSendMessageRequest(
                content=TextContentParam(
                    author="user",
                    content="What's the weather in New York?",
                    type="text",
                )
            ),
        )
        aggregated_content, chunks = collect_streaming_response(stream)
        assert aggregated_content is not None
        assert len(chunks) > 0, "No chunks received in streaming response."
variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=a-harness-langgraph + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/README.md b/examples/tutorials/10_async/00_base/harness_langgraph/README.md new file mode 100644 index 000000000..7efe28207 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/README.md @@ -0,0 +1,57 @@ +# Tutorial: Async Harness LangGraph Agent + +This tutorial demonstrates how to build an **async** LangGraph agent on AgentEx +using the **unified harness surface**: + +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +result = await emitter.auto_send_turn(turn) +``` + +Compare with ``100_langgraph``, which uses the bespoke +``stream_langgraph_events`` helper directly. + +## Key Concepts + +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). + +`UnifiedEmitter.auto_send_turn(turn)` pushes each event to Redis via +`streaming_task_message_context`, accumulates the final text, and returns a +`TurnResult(final_text=..., usage=...)`. + +The same `LangGraphTurn` object can also be passed to +`UnifiedEmitter.yield_turn` in the sync channel. + +### AGX1-377 Note + +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. 
+ +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + auto_send_turn) | +| `project/graph.py` | LangGraph state graph (identical to 100_langgraph) | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: a-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml b/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..bb19e25b3 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/harness_langgraph + - test_utils + dockerfile: 10_async/00_base/harness_langgraph/Dockerfile + dockerignore: 10_async/00_base/harness_langgraph/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: a-harness-langgraph + description: An async LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "a-harness-langgraph" + description: "An async LangGraph agent using the unified 
harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py new file mode 100644 index 000000000..a99395424 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py @@ -0,0 +1,109 @@ +"""ACP handler for async harness LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.auto_send_turn`` streams events +to Redis and returns a ``TurnResult`` with the accumulated final text. + +Differences from ``100_langgraph`` (bespoke path): +- No ``create_langgraph_tracing_handler`` boilerplate. +- ``stream_langgraph_events`` is replaced by + ``UnifiedEmitter.auto_send_turn(LangGraphTurn(stream))``. +- Tool calls/responses go through ``streaming_task_message_context`` + (same code path as text deltas), making the event stream channel-agnostic. +- Usage data (token counts) is captured on ``LangGraphTurn.usage()`` after + ``auto_send_turn`` returns. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. 
@acp.on_task_event_send
async def handle_task_event_send(params: SendEventParams):
    """Run one LangGraph turn for an incoming event and auto-send its output.

    The incoming content is first stored with ``adk.messages.create``. The graph
    is then streamed inside a "message" tracing span,
    ``UnifiedEmitter.auto_send_turn`` delivers the resulting events, and the
    turn's final text is recorded as the span output.
    """
    compiled_graph = await get_graph()
    task_id = params.task.id
    user_message = params.event.content.content

    logger.info(f"Processing message for thread {task_id}")

    await adk.messages.create(task_id=task_id, content=params.event.content)

    graph_input = {"messages": [{"role": "user", "content": user_message}]}
    run_config = {"configurable": {"thread_id": task_id}}

    async with adk.tracing.span(
        trace_id=task_id,
        task_id=task_id,
        name="message",
        input={"message": user_message},
        data={"__span_type__": "AGENT_WORKFLOW"},
    ) as turn_span:
        turn = LangGraphTurn(
            compiled_graph.astream(
                graph_input,
                config=run_config,
                stream_mode=["messages", "updates"],
            ),
            model=None,
        )

        # The span may be falsy; in that case the emitter gets no parent span.
        parent_span_id = turn_span.id if turn_span else None
        outcome = await UnifiedEmitter(
            task_id=task_id,
            trace_id=task_id,
            parent_span_id=parent_span_id,
        ).auto_send_turn(turn)

        if turn_span:
            turn_span.output = {"final_output": outcome.final_text}
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +class AgentState(TypedDict): + """State schema for the agent graph.""" + + messages: Annotated[list[Any], add_messages] + + +async def create_graph(): + """Create and compile the agent graph with checkpointer.""" + llm = ChatOpenAI( + model=MODEL_NAME, + reasoning={"effort": "high", "summary": "auto"}, + ) + llm_with_tools = llm.bind_tools(TOOLS) + + checkpointer = await create_checkpointer() + + def agent_node(state: AgentState) -> dict[str, Any]: + """Process the current state and generate a response.""" + messages = state["messages"] + if not messages or not isinstance(messages[0], SystemMessage): + system_content = SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + messages = [SystemMessage(content=system_content)] + messages + response = llm_with_tools.invoke(messages) + return {"messages": [response]} + + builder = StateGraph(AgentState) + builder.add_node("agent", agent_node) + builder.add_node("tools", ToolNode(tools=TOOLS)) + builder.add_edge(START, "agent") + builder.add_conditional_edges("agent", tools_condition, "tools") + builder.add_edge("tools", "agent") + + return builder.compile(checkpointer=checkpointer) diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py new file mode 100644 index 000000000..6e7614300 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py @@ -0,0 +1,24 @@ +"""Tool definitions for the harness_langgraph async agent.""" + +from langchain_core.tools import Tool + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. 
+ + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml b/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..69856e6db --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "a-harness-langgraph" +version = "0.1.0" +description = "An async LangGraph agent using the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "langgraph", + "langchain-openai", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py b/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..762b2b90c --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py @@ -0,0 +1,100 @@ +""" +Tests for the async harness LangGraph agent. + +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) +end-to-end against a live AgentEx server. 
+ +Configuration: +- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) +- AGENT_NAME: Name of the agent to test (default: a-harness-langgraph) +""" + +import os + +import pytest +import pytest_asyncio + +from agentex import AsyncAgentex +from agentex.types import TextContentParam +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest +from agentex.lib.sdk.fastacp.base.base_acp_server import uuid + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "a-harness-langgraph") + + +@pytest_asyncio.fixture +async def client(): + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + @pytest.mark.asyncio + async def test_send_event(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Hello! 
What can you help me with?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + @pytest.mark.asyncio + async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="What's the weather in San Francisco?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +class TestStreamingEvents: + @pytest.mark.asyncio + async def test_send_event_and_stream(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Tell me a short joke.", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile b/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile new file mode 100644 index 000000000..f6c9fb59b --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip 
setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/harness_langgraph/pyproject.toml /app/harness_langgraph/pyproject.toml +COPY 10_async/10_temporal/harness_langgraph/README.md /app/harness_langgraph/README.md + +WORKDIR /app/harness_langgraph + +COPY 10_async/10_temporal/harness_langgraph/project /app/harness_langgraph/project +COPY 10_async/10_temporal/harness_langgraph/tests /app/harness_langgraph/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at-harness-langgraph + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When we deploy the worker, we will replace the CMD with the following +# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md b/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md new file mode 100644 index 000000000..4df6969f1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md @@ -0,0 +1,53 @@ +# Tutorial: Temporal Harness LangGraph Agent + +This tutorial demonstrates how to build a **Temporal-backed** LangGraph agent on +AgentEx, following the ``130_langgraph`` pattern. The agent's LLM node runs as a +durable Temporal activity; the tools node runs inline in the workflow. + +This agent is named ``at-harness-langgraph`` to distinguish it from +``at130-langgraph`` (the bespoke reference). The graph and workflow structure are +identical; only the agent name changes. 
 + +## Key Concepts + +### Temporal + LangGraph + +The ``LangGraphPlugin`` from ``temporalio.contrib.langgraph`` turns annotated graph +nodes into Temporal activities or inline workflow callables: + +- `agent` node: `execute_in="activity"` (durable, retryable LLM call) +- `tools` node: `execute_in="workflow"` (inline, fast tool execution) + +### Message surfacing + +After each turn, ``emit_langgraph_messages`` converts the new LangGraph messages +(tool requests, tool responses, final text) into AgentEx ``TaskMessage`` objects +and posts them to the task's message stream. + +This is the Temporal-specific path. The non-Temporal async/sync channels use +``UnifiedEmitter.auto_send_turn`` / ``UnifiedEmitter.yield_turn`` with +``LangGraphTurn`` instead. + +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server (Temporal config, LangGraphPlugin) | +| `project/graph.py` | LangGraph graph (agent + tools nodes) | +| `project/workflow.py` | Temporal workflow (signal handlers, emit_langgraph_messages) | +| `project/run_worker.py` | Temporal worker runner | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: at-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run --manifest manifest.yaml +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml b/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..596d38eb4 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml @@ -0,0 +1,51 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/harness_langgraph + - test_utils + dockerfile: 10_async/10_temporal/harness_langgraph/Dockerfile + dockerignore: 10_async/10_temporal/harness_langgraph/.dockerignore + +local_development: + agent: + 
port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at-harness-langgraph + description: "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" + + temporal: + enabled: true + workflows: + - name: at-harness-langgraph + queue_name: at_harness_langgraph_queue + + credentials: + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + + env: {} + +deployment: + image: + repository: "" + tag: "latest" + + imagePullSecrets: [] + + global: + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py new file mode 100644 index 000000000..7af9c5e68 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py @@ -0,0 +1,34 @@ +"""ACP server for the Temporal harness LangGraph agent. + +Follows the ``130_langgraph`` pattern: the Temporal ``LangGraphPlugin`` runs +graph nodes as Temporal activities. The agent logic lives in ``workflow.py`` +(the runtime) and ``graph.py`` (the LangGraph graph), executed by the Temporal +worker (``run_worker.py``), not by this HTTP process. + +The workflow uses ``emit_langgraph_messages`` to surface turn messages to +AgentEx. That helper is Temporal-specific and is not replaced by the unified +harness here (``UnifiedEmitter`` targets the non-Temporal async/sync channels). 
+""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from temporalio.contrib.langgraph import LangGraphPlugin + +from project.graph import GRAPH_NAME, build_graph +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], + ), +) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py new file mode 100644 index 000000000..ce9c2b520 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py @@ -0,0 +1,85 @@ +"""LangGraph graph for at-harness-langgraph — nodes run as Temporal activities. + +Identical in structure to ``130_langgraph/project/graph.py``. The graph +definition is not affected by the harness migration; only the agent naming +changes. The LLM ``agent`` node runs as a durable Temporal activity; +the ``tools`` node runs inline in the workflow. +""" + +from __future__ import annotations + +import os +from typing import Any, Annotated +from datetime import datetime, timedelta + +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key: + os.environ.setdefault("OPENAI_API_KEY", _litellm_key) + +from typing_extensions import TypedDict + +from langgraph.graph import END, START, StateGraph +from langchain_openai import ChatOpenAI +from langchain_core.messages import ToolMessage, SystemMessage +from langgraph.graph.message import add_messages + +from project.tools import TOOLS + +_TOOLS_BY_NAME = {tool.name: tool for tool in TOOLS} + +GRAPH_NAME = "at-harness-langgraph" +MODEL_NAME = "gpt-4o" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Be concise and use tools when they help answer the question.""" + + +class AgentState(TypedDict): + messages: Annotated[list[Any], add_messages] + + +async def agent_node(state: AgentState) -> dict[str, Any]: + """The 'agent' node — one LLM call. Runs as a durable Temporal activity.""" + llm = ChatOpenAI(model=MODEL_NAME).bind_tools(TOOLS) + messages = state["messages"] + if not messages or not isinstance(messages[0], SystemMessage): + system = SystemMessage(content=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) + messages = [system, *messages] + return {"messages": [await llm.ainvoke(messages)]} + + +async def tools_node(state: AgentState) -> dict[str, Any]: + """Run the tool calls the model requested. Runs inline in the workflow.""" + last = state["messages"][-1] + results: list[Any] = [] + for call in getattr(last, "tool_calls", None) or []: + tool = _TOOLS_BY_NAME.get(call["name"]) + if tool is None: + output = f"Error: unknown tool {call['name']!r}. 
Available: {list(_TOOLS_BY_NAME)}" + else: + output = await tool.ainvoke(call["args"]) + results.append(ToolMessage(content=str(output), tool_call_id=call["id"], name=call["name"])) + return {"messages": results} + + +async def route_after_agent(state: AgentState) -> str: + """Go to the tools node if the model requested tools, else finish.""" + last = state["messages"][-1] + return "tools" if getattr(last, "tool_calls", None) else END + + +def build_graph() -> StateGraph: + """Build the agent graph; the LLM node runs as an activity, tools in the workflow.""" + builder = StateGraph(AgentState) + builder.add_node( + "agent", + agent_node, + metadata={"execute_in": "activity", "start_to_close_timeout": timedelta(minutes=5)}, + ) + builder.add_node("tools", tools_node, metadata={"execute_in": "workflow"}) + builder.add_edge(START, "agent") + builder.add_conditional_edges("agent", route_after_agent, {"tools": "tools", END: END}) + builder.add_edge("tools", "agent") + return builder diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py new file mode 100644 index 000000000..ca64464fc --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py @@ -0,0 +1,46 @@ +"""Temporal worker for at-harness-langgraph. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The ``LangGraphPlugin`` is given the graph registry (``{ GRAPH_NAME: graph }``). +At runtime it turns the graph's ``execute_in="activity"`` nodes into Temporal +activities and registers them on the worker automatically. 
+""" + +import asyncio + +from temporalio.contrib.langgraph import LangGraphPlugin + +from project.graph import GRAPH_NAME, build_graph +from project.workflow import AtHarnessLanggraphWorkflow +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker( + task_queue=task_queue_name, + plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], + ) + + await worker.run( + activities=get_all_activities(), + workflow=AtHarnessLanggraphWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py new file mode 100644 index 000000000..51440398e --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py @@ -0,0 +1,24 @@ +"""Tool definitions for the harness_langgraph temporal agent.""" + +from langchain_core.tools import Tool + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + description="Get the current weather for a city. 
Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py new file mode 100644 index 000000000..4125dca39 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py @@ -0,0 +1,80 @@ +"""Temporal workflow for at-harness-langgraph. + +Each turn the workflow runs the LangGraph graph (``project/graph.py``) via the +``temporalio.contrib.langgraph`` plugin. The plugin runs the LLM ``agent`` node +as a durable Temporal activity and the ``tools`` node inline in the workflow. + +Multi-turn memory is kept on the workflow instance (``self._messages``) — it's +durable and replay-safe for free, so no checkpoint database is needed. +""" + +from __future__ import annotations + +import json +from typing import Any + +from temporalio import workflow +from temporalio.contrib.langgraph import graph as lg_graph + +from agentex.lib import adk +from project.graph import GRAPH_NAME +from agentex.lib.adk import emit_langgraph_messages +from agentex.protocol.acp import SendEventParams, CreateTaskParams +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class AtHarnessLanggraphWorkflow(BaseWorkflow): + """Runs the LangGraph agent each turn; its nodes run 
as Temporal activities.""" + + def __init__(self) -> None: + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._messages: list[Any] = [] + self._emitted = 0 + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Echo the user's message, run the graph, surface the new messages.""" + await adk.messages.create(task_id=params.task.id, content=params.event.content) + self._messages.append({"role": "user", "content": params.event.content.content}) + + compiled = lg_graph(GRAPH_NAME).compile() + result = await compiled.ainvoke({"messages": self._messages}) + self._messages = result["messages"] + + await emit_langgraph_messages(self._messages[self._emitted :], params.task.id) + self._emitted = len(self._messages) + + @workflow.signal + async def complete_task_signal(self) -> None: + self._complete_task = True + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n\n" + "Send me a message and I'll respond using a LangGraph agent whose nodes " + "run as durable Temporal activities." 
+ ), + ), + ) + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml b/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..897f54dd6 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at-harness-langgraph" +version = "0.1.0" +description = "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio[langgraph]>=1.27.0", + "langchain-openai", + "langchain-core", + "grandalf", + "python-dotenv", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..05d9ffa01 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py @@ -0,0 +1,106 @@ +"""Integration tests for the Temporal harness LangGraph agent (live agent required). + +These drive a *running* agent over the AgentEx API and verify that: +- the agent sends a welcome message on task creation, +- a weather question triggers a tool_request / tool_response round-trip + (proving the LLM node ran as a Temporal activity and the tool node ran), +- the final answer reflects the tool output. + +To run: +1. 
Start the agent (worker + ACP server): ``agentex agents run --manifest manifest.yaml`` +2. Set AGENTEX_API_BASE_URL if not using the default +3. ``pytest tests/test_agent.py -v`` +""" + +import os +import uuid + +import pytest +import pytest_asyncio +from test_utils.async_utils import ( + poll_messages, + send_event_and_poll_yielding, +) + +from agentex import AsyncAgentex +from agentex.types.task_message import TaskMessage +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at-harness-langgraph") + + +@pytest_asyncio.fixture +async def client(): + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + """The Temporal-backed LangGraph agent responds and uses tools.""" + + @pytest.mark.asyncio + async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): + """Create a task, ask about weather, verify the tool round-trip.""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + task_creation_found = False + async for message in poll_messages(client=client, task_id=task.id, timeout=30, sleep_interval=1.0): + assert isinstance(message, TaskMessage) + if message.content and message.content.type == "text" and message.content.author == "agent": + task_creation_found = True + break + assert task_creation_found, "Task creation welcome message not found" + + seen_tool_request = False + seen_tool_response = False + 
final_message = None + async for message in send_event_and_poll_yielding( + client=client, + agent_id=agent_id, + task_id=task.id, + user_message="What is the weather in San Francisco? Use your tool.", + timeout=60, + sleep_interval=1.0, + ): + assert isinstance(message, TaskMessage) + + if message.content and message.content.type == "tool_request": + seen_tool_request = True + if message.content and message.content.type == "tool_response": + seen_tool_response = True + + if message.content and message.content.type == "text" and message.content.author == "agent": + final_message = message + content_length = len(getattr(message.content, "content", "") or "") + if getattr(message, "streaming_status", None) in (None, "DONE") and content_length > 0: + if seen_tool_response: + break + + assert seen_tool_request, "Expected a tool_request (agent calling get_weather)" + assert seen_tool_response, "Expected a tool_response (get_weather result)" + assert final_message is not None, "Expected a final agent text message" + final_text = getattr(final_message.content, "content", None) if final_message.content else None + assert isinstance(final_text, str) and len(final_text) > 0 + assert "72" in final_text, "Expected weather response to mention 72°F" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/src/agentex/lib/adk/_modules/_langgraph_sync.py b/src/agentex/lib/adk/_modules/_langgraph_sync.py index 381ff6880..fcf1c10a9 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_sync.py +++ b/src/agentex/lib/adk/_modules/_langgraph_sync.py @@ -23,8 +23,16 @@ harness envelope). 
""" +from __future__ import annotations -async def convert_langgraph_to_agentex_events(stream, on_final_ai_message=None): +from typing import Any, Callable, Optional +from collections.abc import AsyncGenerator + + +async def convert_langgraph_to_agentex_events( + stream: Any, + on_final_ai_message: Optional[Callable[..., None]] = None, +) -> AsyncGenerator[Any, None]: """Convert LangGraph streaming events to Agentex TaskMessageUpdate events. Expects the stream from graph.astream() called with diff --git a/src/agentex/lib/adk/_modules/_langgraph_turn.py b/src/agentex/lib/adk/_modules/_langgraph_turn.py index 9be486323..6f0913623 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_turn.py +++ b/src/agentex/lib/adk/_modules/_langgraph_turn.py @@ -12,6 +12,7 @@ from __future__ import annotations from typing import Any, AsyncIterator +from collections.abc import AsyncGenerator from agentex.lib.core.harness.types import TurnUsage, StreamTaskMessage from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events @@ -100,7 +101,7 @@ def __init__(self, stream: Any, model: str | None = None) -> None: def events(self) -> AsyncIterator[StreamTaskMessage]: return self._generate_events() - async def _generate_events(self) -> AsyncIterator[StreamTaskMessage]: + async def _generate_events(self) -> AsyncGenerator[StreamTaskMessage, None]: def _capture(ai_msg: Any) -> None: usage_metadata = getattr(ai_msg, "usage_metadata", None) if usage_metadata is not None: diff --git a/tests/lib/adk/test_langgraph_sync_unified.py b/tests/lib/adk/test_langgraph_sync_unified.py index 57e7fb821..84f6af959 100644 --- a/tests/lib/adk/test_langgraph_sync_unified.py +++ b/tests/lib/adk/test_langgraph_sync_unified.py @@ -14,6 +14,7 @@ import sys from typing import Any +from datetime import datetime, timezone from dataclasses import field, dataclass import pytest @@ -68,6 +69,7 @@ async def start_span(self, **kw) -> Any: id=f"span-{len(self.spans_started) + 1}", 
trace_id=kw.get("trace_id", "trace1"), name=kw.get("name", ""), + start_time=datetime.now(tz=timezone.utc), ) self.spans_started.append(kw) return sp From 68572d556fda90ca59c8efdb1cba158f919dcda8 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Thu, 18 Jun 2026 17:12:47 -0400 Subject: [PATCH 35/35] fix(langgraph): restore created_at + docstring-only deprecation for tracing handler (PR 5/6) AGX1-378: wire workflow_now_if_in_workflow() into stream_langgraph_events so Temporal callers get deterministic message timestamps, matching the pattern used by the openai/litellm providers. Deprecation alignment: remove runtime warnings.warn from create_langgraph_tracing_handler (and unused import warnings) to match PR 4/6 pydantic-ai convention. Deprecation remains in docstrings on module, class, and function. Callers under -W error are no longer broken. Test alignment after rebase onto unified-harness-surface (b4b8b330): - FakeStreamingModule.streaming_task_message_context in test_langgraph_async.py and test_pydantic_ai_async.py updated to accept **kw (foundation now passes created_at). - Three "no tool spans for Full events" tests updated to assert the new SpanDeriver behaviour: Full(ToolRequestContent) opens a span, Full(ToolResponseContent) closes it. - Two "accumulates all text" multi-step tests corrected to last-segment semantics (auto_send resets final_text_parts on each new Start(TextContent)). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lib/adk/_modules/_langgraph_async.py | 11 ++++++- .../lib/adk/_modules/_langgraph_tracing.py | 8 ----- tests/lib/adk/test_langgraph_async.py | 20 ++++++------- tests/lib/adk/test_langgraph_sync.py | 26 +++++++++++------ tests/lib/adk/test_langgraph_sync_unified.py | 28 +++++++----------- tests/lib/adk/test_pydantic_ai_async.py | 2 +- .../harness/test_harness_langgraph_async.py | 29 +++++++++++++------ .../harness/test_harness_langgraph_sync.py | 14 +++++---- 8 files changed, 78 insertions(+), 60 deletions(-) diff --git a/src/agentex/lib/adk/_modules/_langgraph_async.py b/src/agentex/lib/adk/_modules/_langgraph_async.py index 40e8f024b..02ef059eb 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_async.py +++ b/src/agentex/lib/adk/_modules/_langgraph_async.py @@ -16,6 +16,8 @@ handles Full events correctly; no coalescing wrapper is needed. """ +from agentex.lib.utils.temporal import workflow_now_if_in_workflow + async def stream_langgraph_events(stream, task_id: str) -> str: """Stream LangGraph events to Agentex via Redis. @@ -37,6 +39,11 @@ async def stream_langgraph_events(stream, task_id: str) -> str: NOT Start+Delta+Done like pydantic-ai. ``auto_send`` handles Full events correctly; no coalescing wrapper is needed. + AGX1-378 note: ``created_at`` is set from ``workflow.now()`` when called inside a + Temporal workflow, matching the pattern used by the openai/litellm providers. + Outside a workflow (plain async activities, sync agents) it is ``None`` and the + server's wall clock is used. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) task_id: The Agentex task ID to stream messages to. @@ -50,7 +57,9 @@ async def stream_langgraph_events(stream, task_id: str) -> str: # AGX1-377 note: LangGraph emits tool requests as Full events (from "updates"), # NOT Start+Delta+Done like pydantic-ai. auto_send handles Full events correctly; # no coalescing wrapper is needed. 
+ # AGX1-378: stamp messages with workflow.now() inside Temporal for deterministic + # created_at ordering; falls back to None (server wall clock) outside a workflow. turn = LangGraphTurn(stream, model=None) emitter = UnifiedEmitter(task_id=task_id, trace_id=None, parent_span_id=None) - result = await emitter.auto_send_turn(turn) + result = await emitter.auto_send_turn(turn, created_at=workflow_now_if_in_workflow()) return result.final_text diff --git a/src/agentex/lib/adk/_modules/_langgraph_tracing.py b/src/agentex/lib/adk/_modules/_langgraph_tracing.py index 0aa411f46..2162201e1 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_tracing.py +++ b/src/agentex/lib/adk/_modules/_langgraph_tracing.py @@ -14,7 +14,6 @@ from __future__ import annotations -import warnings from uuid import UUID from typing import Any, override @@ -268,13 +267,6 @@ def create_langgraph_tracing_handler( This function remains available for backward compatibility. """ - warnings.warn( - "create_langgraph_tracing_handler is deprecated. 
Use LangGraphTurn with " - "UnifiedEmitter instead — the unified harness derives equivalent spans from " - "the canonical event stream without a LangChain callback handler.", - DeprecationWarning, - stacklevel=2, - ) return AgentexLangGraphTracingHandler( trace_id=trace_id, parent_span_id=parent_span_id, diff --git a/tests/lib/adk/test_langgraph_async.py b/tests/lib/adk/test_langgraph_async.py index 96befb461..682bd43bc 100644 --- a/tests/lib/adk/test_langgraph_async.py +++ b/tests/lib/adk/test_langgraph_async.py @@ -79,7 +79,7 @@ class FakeStreamingModule: def __init__(self) -> None: self.contexts: list[FakeContext] = [] - def streaming_task_message_context(self, *, task_id: str, initial_content: Any) -> FakeContext: + def streaming_task_message_context(self, *, task_id: str, initial_content: Any, **kw: Any) -> FakeContext: tm = TaskMessage( id=f"m{len(self.contexts) + 1}", task_id=task_id, @@ -226,14 +226,15 @@ async def test_tool_response_posted_via_streaming_context( assert content.content == "Sunny, 72F" assert streaming.contexts[0].closed is True - async def test_multi_step_text_then_tool_then_text_accumulates_all_text( + async def test_multi_step_text_then_tool_then_text_last_segment( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Unified surface: final_text accumulates all text across the turn. + """Unified surface: final_text uses last-segment semantics. - Old bespoke impl only returned the last text segment (reset final_text - each time a new text context opened). The unified surface accumulates - all text because auto_send appends every TextDelta. + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + Both text contexts are still opened and streamed to Redis; only the + return value is last-segment. This matches stream_pydantic_ai_events. 
""" from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk @@ -256,10 +257,9 @@ async def test_multi_step_text_then_tool_then_text_accumulates_all_text( final = await stream_langgraph_events(stream, TASK_ID) - # Unified surface accumulates all text (not just the last segment) - assert "Looking up..." in final - assert "Found it!" in final - # Two text streaming contexts (one per text segment) + # Last segment only — first text segment is NOT in final_text + assert final == "Found it!" + # Two text streaming contexts (one per text segment) — both streamed to Redis text_ctxs = [c for c in streaming.contexts if isinstance(c.initial_content, TextContent)] assert len(text_ctxs) == 2 assert all(ctx.closed for ctx in text_ctxs) diff --git a/tests/lib/adk/test_langgraph_sync.py b/tests/lib/adk/test_langgraph_sync.py index 8bfcfebde..d64b5b1e8 100644 --- a/tests/lib/adk/test_langgraph_sync.py +++ b/tests/lib/adk/test_langgraph_sync.py @@ -3,7 +3,8 @@ Covers: - Basic text, tool call, and tool response emission - on_final_ai_message callback for usage capture -- Deprecation warning emitted by create_langgraph_tracing_handler +- create_langgraph_tracing_handler symbol is importable and functional + (runtime DeprecationWarning removed; deprecation is docstring-only) NOTE: langchain_core imports must be deferred to test-function scope because conftest.py stubs out ``langchain_core.messages`` with MagicMock for ADK @@ -13,7 +14,6 @@ from __future__ import annotations import sys -import warnings from typing import Any, AsyncIterator import pytest @@ -197,13 +197,21 @@ def _cb(msg): assert yield_order.index("event") < yield_order.index("callback") -class TestDeprecationWarning: - def test_create_langgraph_tracing_handler_emits_deprecation_warning(self): +class TestLangGraphTracingHandlerBackwardCompat: + def test_create_langgraph_tracing_handler_no_runtime_warning(self): + """Deprecated symbol remains importable and emits no runtime DeprecationWarning. 
+ + The runtime warnings.warn was removed (docstring-only deprecation) to + align with PR 4/6 and avoid breaking callers under warnings-as-errors. + Using ``warnings.simplefilter("error", DeprecationWarning)`` verifies + that calling the function is safe under -W error conditions. + """ + import warnings + from agentex.lib.adk._modules._langgraph_tracing import create_langgraph_tracing_handler with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - create_langgraph_tracing_handler(trace_id="t1") - assert any(issubclass(warning.category, DeprecationWarning) for warning in w), ( - "create_langgraph_tracing_handler must emit a DeprecationWarning" - ) + warnings.simplefilter("error", DeprecationWarning) + create_langgraph_tracing_handler(trace_id="t1", parent_span_id="p1") + + assert w == [], "create_langgraph_tracing_handler must NOT emit a runtime DeprecationWarning" diff --git a/tests/lib/adk/test_langgraph_sync_unified.py b/tests/lib/adk/test_langgraph_sync_unified.py index 84f6af959..cfd522828 100644 --- a/tests/lib/adk/test_langgraph_sync_unified.py +++ b/tests/lib/adk/test_langgraph_sync_unified.py @@ -149,16 +149,13 @@ def fake_tracer(self): ) return tracer, backend - async def test_tool_span_not_derived_from_full_events(self, fake_tracer): - """AGX1-377: LangGraph emits tool calls as Full events (not Start+Done). - The SpanDeriver opens tool spans from Start(ToolRequestContent)+Done - sequences. Since LangGraph uses Full, no tool span is opened by the - SpanDeriver -- this is the documented AGX1-377 gap resolved by the - unified surface (Full events are emitted identically; cross-channel - span equivalence arrives with AGX1-373). - - The tracer must still be invoked (SpanDeriver.observe is called for each - event); it just produces no open-span signals for LangGraph Full tool events. + async def test_tool_span_derived_from_full_events(self, fake_tracer): + """AGX1-377: SpanDeriver now handles Full tool events for LangGraph. 
+ + Full(ToolRequestContent) opens a tool span keyed by tool_call_id; + Full(ToolResponseContent) closes it. This bridges the previous gap where + LangGraph's Full-event path produced no spans, aligning it with + Start+Done harnesses (pydantic-ai, openai-agents). """ from langchain_core.messages import AIMessage, ToolMessage @@ -175,13 +172,10 @@ async def test_tool_span_not_derived_from_full_events(self, fake_tracer): emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] - # AGX1-377: Full events don't produce tool spans via SpanDeriver today. - # This is the documented gap; full cross-channel equivalence arrives with AGX1-373. - assert backend.spans_started == [], ( - "Expected no tool spans for LangGraph Full events (AGX1-377); if this " - "assertion fails it means SpanDeriver now handles Full events — update " - "the test to assert the new span names." - ) + assert len(backend.spans_started) == 1, "Full(ToolRequestContent) opens one tool span" + started = backend.spans_started[0] + assert started["name"] == "get_weather" + assert started["input"] == {"city": "Paris"} async def test_no_spans_when_no_tool_calls(self, fake_tracer): """yield_turn with tracer but no tool calls emits no spans.""" diff --git a/tests/lib/adk/test_pydantic_ai_async.py b/tests/lib/adk/test_pydantic_ai_async.py index dadda5914..62cc3970a 100644 --- a/tests/lib/adk/test_pydantic_ai_async.py +++ b/tests/lib/adk/test_pydantic_ai_async.py @@ -82,7 +82,7 @@ class FakeStreamingModule: def __init__(self) -> None: self.contexts: list[FakeContext] = [] - def streaming_task_message_context(self, *, task_id: str, initial_content: Any) -> FakeContext: + def streaming_task_message_context(self, *, task_id: str, initial_content: Any, **kw: Any) -> FakeContext: tm = TaskMessage( id=f"m{len(self.contexts) + 1}", task_id=task_id, diff --git 
a/tests/lib/core/harness/test_harness_langgraph_async.py b/tests/lib/core/harness/test_harness_langgraph_async.py index 7bfe61b5e..39bf5bc66 100644 --- a/tests/lib/core/harness/test_harness_langgraph_async.py +++ b/tests/lib/core/harness/test_harness_langgraph_async.py @@ -213,8 +213,14 @@ async def test_tool_response_posted_via_streaming_context(self): assert tool_resp_ctxs[0].initial_content.content == "Sunny, 72F" assert tool_resp_ctxs[0].closed is True - async def test_multi_step_accumulates_all_text(self): - """Unified surface: final_text accumulates all text, not just last segment.""" + async def test_multi_step_final_text_is_last_segment(self): + """Unified surface: final_text uses last-segment semantics. + + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + This matches the behaviour documented in auto_send.py and mirrors + stream_pydantic_ai_events. + """ from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk chunk1 = AIMessageChunk(content="Searching...") @@ -232,11 +238,10 @@ async def test_multi_step_accumulates_all_text(self): ] result, fake_streaming, _ = await _run_auto_send_turn(events) - # All text accumulated - assert "Searching..." in result.final_text - assert "Found it!" in result.final_text + # Last segment only — first text segment is NOT in final_text + assert result.final_text == "Found it!" 
- # Two text streaming contexts + # Two text streaming contexts still opened (both streamed to Redis) text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] assert len(text_ctxs) == 2 @@ -269,8 +274,12 @@ async def test_turn_usage_populated_after_events_consumed(self): assert usage.output_tokens == 5 assert usage.total_tokens == 15 - async def test_tracer_does_not_produce_tool_spans_for_full_events(self): - """AGX1-377: Full events don't trigger SpanDeriver tool spans.""" + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events (request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). + """ from langchain_core.messages import AIMessage, ToolMessage tc = {"id": "c1", "name": "t", "args": {}} @@ -284,4 +293,6 @@ async def test_tracer_does_not_produce_tool_spans_for_full_events(self): _, _, fake_tracing = await _run_auto_send_turn(events, trace_id="trace-1") assert fake_tracing is not None - assert fake_tracing.started == [], "AGX1-377: Full events don't trigger tool spans" + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" diff --git a/tests/lib/core/harness/test_harness_langgraph_sync.py b/tests/lib/core/harness/test_harness_langgraph_sync.py index 6117cacd8..5a6667f7e 100644 --- a/tests/lib/core/harness/test_harness_langgraph_sync.py +++ b/tests/lib/core/harness/test_harness_langgraph_sync.py @@ -189,9 +189,12 @@ async def test_empty_stream_yields_nothing(self): out, _ = await _run_yield_turn([]) assert out == [] - async def test_tracer_invoked_but_no_tool_spans_for_full_events(self): - """AGX1-377: tool spans are NOT 
derived from Full events (SpanDeriver uses Start+Done). - This is the documented gap; full cross-channel equivalence arrives with AGX1-373.""" + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events (request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). + """ from langchain_core.messages import AIMessage, ToolMessage tc = {"id": "c1", "name": "t", "args": {}} @@ -205,8 +208,9 @@ async def test_tracer_invoked_but_no_tool_spans_for_full_events(self): _, fake_tracing = await _run_yield_turn(events, trace_id="trace-1") assert fake_tracing is not None - # No tool spans opened — Full events don't trigger OpenSpan in SpanDeriver - assert fake_tracing.started == [], "Expected no tool spans for LangGraph Full events (AGX1-377)" + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" async def test_usage_captured_after_yield(self): from langchain_core.messages import AIMessage