Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/ets-per-table-memory-metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@core/electric-telemetry': patch
---

Emit per-individual-table ETS memory/size telemetry (`ets.table.memory` and `ets.table.size`, tagged by `table_name` and `table_type`) for the top N tables by memory, controlled by the new `top_ets_individual_count` option (default 10). Complements the existing per-`table_type` `ets.memory.total` aggregate.
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,19 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
defp exporter_child_specs(opts) do
metrics = metrics(opts)

# Metrics that should reach Prometheus only, e.g. stack-level metrics that the other
# reporters already export per-stack. Appending them here avoids double-reporting.
prometheus_metrics = metrics ++ Map.get(opts, :additional_prometheus_metrics, [])
# `ets.table.*` are high-cardinality per-table gauges tagged by the raw table name
# (which embeds per-shape/stack ids). That rotating top-N set is the intended trade-off
# for OTel/Honeycomb (the exporter clears its series between exports) and is harmless for
# StatsD (each value is pushed as it's emitted, so nothing accumulates in-process). But
# `TelemetryMetricsPrometheus.Core` keeps every series in its registry
# with no TTL — so a rotating top-N set would accumulate stale series indefinitely. Keep
# these off the Prometheus `/metrics` path; its `ets.memory.total` (by `table_type`) stays.
#
# `additional_prometheus_metrics` are, conversely, Prometheus-only extras (stack-level
# metrics the other reporters already export per-stack) appended to avoid double-reporting.
prometheus_metrics =
Enum.reject(metrics, &prometheus_excluded?/1) ++
Map.get(opts, :additional_prometheus_metrics, [])

[
Reporters.CallHomeReporter.child_spec(
Expand All @@ -57,6 +67,11 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
]
end

# Per-table ETS gauges are excluded from the (TTL-less) Prometheus registry; see
# `exporter_child_specs/1`. They still flow to OTel and StatsD.
defp prometheus_excluded?(%{name: [:ets, :table | _]}), do: true
defp prometheus_excluded?(_), do: false

@impl ElectricTelemetry.Poller
def builtin_periodic_measurements(telemetry_opts) do
[
Expand All @@ -82,6 +97,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
:process_memory,
:process_bin_memory,
:ets_memory,
:ets_table_memory,
:get_system_load_average,
:get_system_memory_usage
],
Expand All @@ -98,6 +114,8 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
last_value("process.bin_memory.max_ref_count", tags: [:process_type]),
last_value("process.bin_memory.avg_ref_count", tags: [:process_type]),
last_value("ets.memory.total", tags: [:table_type], unit: :byte),
last_value("ets.table.memory", tags: [:table_name, :table_type], unit: :byte),
last_value("ets.table.size", tags: [:table_name, :table_type]),
last_value("system.cpu.core_count"),
last_value("system.cpu.utilization.total"),
last_value("system.load_percent.avg1"),
Expand Down Expand Up @@ -186,7 +204,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
:telemetry.execute(
[:process, :memory],
%{total: map.proc_mem},
%{process_type: to_string(map.type)}
%{process_type: map.type}
)
end
end
Expand All @@ -202,15 +220,28 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
max_ref_count: map.max_ref_count,
avg_ref_count: map.avg_ref_count
},
%{process_type: to_string(map.type)}
%{process_type: map.type}
)
end
end

def ets_memory(%{intervals_and_thresholds: %{top_ets_table_count: ets_table_count}}) do
for %{type: type, memory: memory} <-
ElectricTelemetry.EtsTables.top_by_type(ets_table_count) do
:telemetry.execute([:ets, :memory], %{total: memory}, %{table_type: to_string(type)})
:telemetry.execute([:ets, :memory], %{total: memory}, %{table_type: type})
end
end

def ets_table_memory(%{
intervals_and_thresholds: %{top_ets_individual_count: individual_count}
}) do
for %{name: name, type: type, memory: memory, size: size} <-
ElectricTelemetry.EtsTables.top_tables(individual_count) do
:telemetry.execute(
[:ets, :table],
%{memory: memory, size: size},
%{table_name: name, table_type: type}
)
end
end

Expand Down
1 change: 1 addition & 0 deletions packages/electric-telemetry/lib/electric/telemetry/opts.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ defmodule ElectricTelemetry.Opts do
default: {:count, 5}
],
top_ets_table_count: [type: :integer, default: 10],
top_ets_individual_count: [type: :integer, default: 10],
# Garbage collection should run almost instantly since each process has its own heap that
# is garbage collected independently of others. 50ms might be too generous.
long_gc_threshold: [type: :integer, default: 50],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,37 @@ defmodule ElectricTelemetry.ApplicationTelemetryTest do
end
end

describe "ets.table.* metric routing" do
setup do
{:ok, opts} =
ElectricTelemetry.validate_options(
instance_id: "test-instance",
version: "1.0.0",
reporters: [prometheus?: true, statsd_host: "localhost", otel_metrics?: true]
)

{:ok, {_flags, children}} = ApplicationTelemetry.init(opts)
%{children: children}
end

test "reach OTel and StatsD but are kept off the Prometheus registry", %{children: children} do
ets_table? = &(&1.name in [[:ets, :table, :memory], [:ets, :table, :size]])

assert Enum.any?(reporter_metrics(children, OtelMetricExporter), ets_table?)
assert Enum.any?(reporter_metrics(children, TelemetryMetricsStatsd), ets_table?)
refute Enum.any?(reporter_metrics(children, :prometheus_metrics), ets_table?)
end

test "the bounded ets.memory.total (by table_type) still reaches Prometheus", %{
children: children
} do
assert Enum.any?(
reporter_metrics(children, :prometheus_metrics),
&(&1.name == [:ets, :memory, :total])
)
end
end

# `Supervisor.init/2` normalises child specs into maps, nesting the reporter's start args
# (which carry `:metrics`) inside `:start`. Reporters are identified by their child spec id.
defp reporter_metrics(children, id) do
Expand Down Expand Up @@ -72,4 +103,58 @@ defmodule ElectricTelemetry.ApplicationTelemetryTest do
end
end
end

describe "ets_table_memory/1" do
test "emits a [:ets, :table] event per top table with memory/size and name/type tags" do
table = :ets.new(:"ApplicationTelemetryTest:ets_table_memory", [:public, :named_table])
for i <- 1..200, do: :ets.insert(table, {i, :binary.copy(<<0>>, 1000)})

ref = make_ref()
test_pid = self()
handler_id = {__MODULE__, :ets_table, ref}

:telemetry.attach(
handler_id,
[:ets, :table],
fn _event, measurements, metadata, _config ->
send(test_pid, {ref, measurements, metadata})
end,
nil
)

on_exit(fn -> :telemetry.detach(handler_id) end)

ApplicationTelemetry.ets_table_memory(%{
intervals_and_thresholds: %{top_ets_individual_count: 100}
})

events = collect_events(ref, [])

assert events != []

for {measurements, metadata} <- events do
assert %{memory: memory, size: size} = measurements
assert is_integer(memory) and memory > 0
assert is_integer(size) and size >= 0
# Tag values are left as-is (atom name, string type); every reporter
# stringifies tag values itself, so no to_string/1 at the call site.
assert %{table_name: name, table_type: type} = metadata
assert is_atom(name)
assert is_binary(type)
end

# Our named test table should be among the emitted tables.
assert Enum.any?(events, fn {_measurements, metadata} ->
metadata.table_name == :"ApplicationTelemetryTest:ets_table_memory"
end)
end
end

defp collect_events(ref, acc) do
receive do
{^ref, measurements, metadata} -> collect_events(ref, [{measurements, metadata} | acc])
after
0 -> Enum.reverse(acc)
end
end
end
Loading