From 87b46b6a90587d6ec048bd6bac5f1ba60b4b09d2 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 11 Jun 2026 11:09:11 -0400 Subject: [PATCH] move latency metrics to under response_speed --- src/eva/metrics/diagnostic/response_speed.py | 38 +++++++++++++++- src/eva/metrics/runner.py | 47 -------------------- tests/fixtures/metric_signatures.json | 4 +- 3 files changed, 39 insertions(+), 50 deletions(-) diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 56571e2c..85f7fae2 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -4,11 +4,38 @@ final evaluation scores. """ +import json +from pathlib import Path + from eva.metrics.base import CodeMetric, MetricContext from eva.metrics.registry import register_metric from eva.models.results import MetricScore +def _load_component_latencies(output_dir: str) -> dict[str, dict]: + """Load per-component latency stats from result.json. + + Returns a dict mapping short keys (e.g. "llm_latency", "stt_latency", + "tts_latency") to their stats dicts, only for non-null entries. + """ + result_path = Path(output_dir) / "result.json" + if not result_path.exists(): + return {} + + try: + result_data = json.loads(result_path.read_text()) + except Exception: + return {} + + latencies: dict[str, dict] = {} + for key in ("llm_latency", "stt_latency", "tts_latency"): + value = result_data.get(key) + if value is not None and isinstance(value, dict) and value.get("mean_ms") is not None: + latencies[key] = value + + return latencies + + def _split_by_tool_calls( context: MetricContext, ) -> tuple[list[float], list[float]]: @@ -60,7 +87,7 @@ class ResponseSpeedMetric(CodeMetric): description = "Diagnostic metric: latency between user utterance end and assistant response start" exclude_from_pass_at_k = True higher_is_better = False # Score is latency in seconds — lower is better. - version = "v0.1" + version = "v0.2" async def compute(self, context: MetricContext) -> MetricScore: try: @@ -102,6 +129,15 @@ async def compute(self, context: MetricContext) -> MetricScore: details=stats, ) + # Add per-component latency sub_metrics from result.json + for key, latency_stats in _load_component_latencies(context.output_dir).items(): + sub_metrics[key] = MetricScore( + name=f"{self.name}.{key}", + score=latency_stats["mean_ms"], + normalized_score=None, + details=latency_stats, + ) + return MetricScore( name=self.name, score=overall_stats["mean_speed_seconds"], diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 77881a38..fb836696 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -4,7 +4,6 @@ import inspect import json import os -import statistics from dataclasses import dataclass, field from pathlib import Path from typing import Any @@ -955,44 +954,6 @@ def _compute_pass_at_k_from_all_metrics( logger.info(f"pass@k computation complete for {len(results)} records") return results - def _compute_latency_summary(self) -> dict[str, Any]: - """Compute mean latency for llm, stt, and tts across all records. - - Reads each record's result.json and collects the mean_ms values - for llm_latency, stt_latency, and tts_latency, skipping nulls. - Returns a dict with the mean of mean_ms values for each latency type - that has at least one non-null entry. - """ - latency_keys = ["llm_latency", "stt_latency", "tts_latency", "model_response_latency"] - collected: dict[str, list[float]] = {k: [] for k in latency_keys} - - for _record_id, record_dir in self._discover_record_dirs(self.run_dir, self.record_ids): - result_path = record_dir / "result.json" - if not result_path.exists(): - continue - try: - result_data = json.loads(result_path.read_text()) - except Exception: - continue - for key in latency_keys: - latency = result_data.get(key) - if latency is not None and isinstance(latency, dict) and latency.get("mean_ms") is not None: - collected[key].append(latency["mean_ms"]) - - summary: dict[str, Any] = {} - for key in latency_keys: - values = collected[key] - # Strip the _latency suffix for the summary key (e.g. "llm", "stt", "tts") - short_key = key.removesuffix("_latency") - if values: - summary[short_key] = { - "mean_of_means_ms": round(statistics.mean(values), 2), - } - else: - summary[short_key] = None - - return summary - async def _save_summary( self, all_metrics: dict[str, RecordMetrics], @@ -1077,14 +1038,6 @@ async def _save_summary( elif existing_summary.get("pass_at_k_config"): summary["pass_at_k_config"] = existing_summary["pass_at_k_config"] - # Add latency summary from record result.json files - try: - latency_summary = self._compute_latency_summary() - if latency_summary: - summary["latency"] = latency_summary - except Exception as e: - logger.warning(f"Failed to compute latency summary: {e}") - try: run_config = json.loads((self.run_dir / "config.json").read_text()) provenance = capture_metrics_provenance(run_metric_names, run_config=run_config) diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index a8c2023a..aafb600f 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -56,8 +56,8 @@ "ResponseSpeedMetric": { "name": "response_speed", "prompt_hash": null, - "source_hash": "ebce1a06bf30", - "version": "v0.1" + "source_hash": "f4d81db9f95b", + "version": "v0.2" }, "STTWERMetric": { "name": "stt_wer",