Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion src/eva/metrics/diagnostic/response_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,38 @@
final evaluation scores.
"""

import json
from pathlib import Path

from eva.metrics.base import CodeMetric, MetricContext
from eva.metrics.registry import register_metric
from eva.models.results import MetricScore


def _load_component_latencies(output_dir: str) -> dict[str, dict]:
    """Load per-component latency stats from ``result.json``.

    Reads ``<output_dir>/result.json`` and picks out the ``llm_latency``,
    ``stt_latency`` and ``tts_latency`` entries.

    Args:
        output_dir: Directory expected to contain ``result.json``.

    Returns:
        A dict mapping each of those keys to its stats dict. A key is
        included only when its value is a dict whose ``"mean_ms"`` is not
        ``None``. An empty dict is returned when the file is missing, cannot
        be read or parsed, or does not hold a JSON object at the top level.
    """
    result_path = Path(output_dir) / "result.json"

    try:
        result_data = json.loads(result_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Best-effort load: a missing/unreadable file (OSError) or malformed
        # content (JSONDecodeError and UnicodeDecodeError are ValueErrors)
        # simply means there are no component latencies to report.
        return {}

    # Valid JSON is not necessarily an object; a list or scalar has no keys
    # to look up, so treat it like an absent file instead of raising.
    if not isinstance(result_data, dict):
        return {}

    latencies: dict[str, dict] = {}
    for key in ("llm_latency", "stt_latency", "tts_latency"):
        value = result_data.get(key)
        if isinstance(value, dict) and value.get("mean_ms") is not None:
            latencies[key] = value

    return latencies


def _split_by_tool_calls(
context: MetricContext,
) -> tuple[list[float], list[float]]:
Expand Down Expand Up @@ -60,7 +87,7 @@ class ResponseSpeedMetric(CodeMetric):
description = "Diagnostic metric: latency between user utterance end and assistant response start"
exclude_from_pass_at_k = True
higher_is_better = False # Score is latency in seconds — lower is better.
version = "v0.1"
version = "v0.2"

async def compute(self, context: MetricContext) -> MetricScore:
try:
Expand Down Expand Up @@ -102,6 +129,15 @@ async def compute(self, context: MetricContext) -> MetricScore:
details=stats,
)

# Add per-component latency sub_metrics from result.json
for key, latency_stats in _load_component_latencies(context.output_dir).items():
sub_metrics[key] = MetricScore(
name=f"{self.name}.{key}",
score=latency_stats["mean_ms"],
normalized_score=None,
details=latency_stats,
)

return MetricScore(
name=self.name,
score=overall_stats["mean_speed_seconds"],
Expand Down
47 changes: 0 additions & 47 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import inspect
import json
import os
import statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -948,44 +947,6 @@ def _compute_pass_at_k_from_all_metrics(
logger.info(f"pass@k computation complete for {len(results)} records")
return results

def _compute_latency_summary(self) -> dict[str, Any]:
    """Summarise per-component latency across all discovered records.

    For every record directory yielded by
    ``self._discover_record_dirs(self.run_dir, self.record_ids)``, reads
    ``result.json`` and collects the ``mean_ms`` value of the
    ``llm_latency``, ``stt_latency``, ``tts_latency`` and
    ``model_response_latency`` entries. Records whose file is missing,
    unreadable, malformed, or not a JSON object are skipped, as are entries
    that are null, not a dict, or whose ``mean_ms`` is null or non-numeric.

    Returns:
        A dict keyed by the latency name with its ``_latency`` suffix
        removed (``"llm"``, ``"stt"``, ``"tts"``, ``"model_response"``).
        Every key is always present: its value is
        ``{"mean_of_means_ms": <mean of the collected values, rounded to 2
        decimals>}``, or ``None`` when no record supplied a value.
    """
    latency_keys = ("llm_latency", "stt_latency", "tts_latency", "model_response_latency")
    collected: dict[str, list[float]] = {key: [] for key in latency_keys}

    for _record_id, record_dir in self._discover_record_dirs(self.run_dir, self.record_ids):
        result_path = record_dir / "result.json"
        try:
            result_data = json.loads(result_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Missing/unreadable file or malformed content: skip this record
            # so one bad result.json cannot hide the others.
            continue

        # A top-level list or scalar has no latency keys; skipping it keeps
        # the summary for the remaining records instead of raising.
        if not isinstance(result_data, dict):
            continue

        for key in latency_keys:
            latency = result_data.get(key)
            if not isinstance(latency, dict):
                continue
            mean_ms = latency.get("mean_ms")
            # Only numeric values can be averaged; anything else would make
            # statistics.mean raise and discard the whole summary.
            if isinstance(mean_ms, (int, float)):
                collected[key].append(mean_ms)

    summary: dict[str, Any] = {}
    for key in latency_keys:
        values = collected[key]
        # Strip the _latency suffix for the summary key (e.g. "llm", "stt", "tts").
        short_key = key.removesuffix("_latency")
        summary[short_key] = (
            {"mean_of_means_ms": round(statistics.mean(values), 2)} if values else None
        )

    return summary

async def _save_summary(
self,
all_metrics: dict[str, RecordMetrics],
Expand Down Expand Up @@ -1070,14 +1031,6 @@ async def _save_summary(
elif existing_summary.get("pass_at_k_config"):
summary["pass_at_k_config"] = existing_summary["pass_at_k_config"]

# Add latency summary from record result.json files
try:
latency_summary = self._compute_latency_summary()
if latency_summary:
summary["latency"] = latency_summary
except Exception as e:
logger.warning(f"Failed to compute latency summary: {e}")

try:
run_config = json.loads((self.run_dir / "config.json").read_text())
provenance = capture_metrics_provenance(run_metric_names, run_config=run_config)
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/metric_signatures.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
"ResponseSpeedMetric": {
"name": "response_speed",
"prompt_hash": null,
"source_hash": "7fecaf5fa24f",
"version": "v0.1"
"source_hash": "7cd7e8baa3d2",
"version": "v0.2"
},
"STTWERMetric": {
"name": "stt_wer",
Expand Down
Loading