diff --git a/.changeset/ets-per-table-memory-metrics.md b/.changeset/ets-per-table-memory-metrics.md new file mode 100644 index 0000000000..5865859434 --- /dev/null +++ b/.changeset/ets-per-table-memory-metrics.md @@ -0,0 +1,5 @@ +--- +'@core/electric-telemetry': patch +--- + +Emit per-individual-table ETS memory/size telemetry (`ets.table.memory` and `ets.table.size`, tagged by `table_name` and `table_type`) for the top N tables by memory, controlled by the new `top_ets_individual_count` option (default 10). Complements the existing per-`table_type` `ets.memory.total` aggregate. diff --git a/packages/electric-telemetry/lib/electric/telemetry/application_telemetry.ex b/packages/electric-telemetry/lib/electric/telemetry/application_telemetry.ex index d2479ec687..51b5d0f465 100644 --- a/packages/electric-telemetry/lib/electric/telemetry/application_telemetry.ex +++ b/packages/electric-telemetry/lib/electric/telemetry/application_telemetry.ex @@ -42,9 +42,19 @@ defmodule ElectricTelemetry.ApplicationTelemetry do defp exporter_child_specs(opts) do metrics = metrics(opts) - # Metrics that should reach Prometheus only, e.g. stack-level metrics that the other - # reporters already export per-stack. Appending them here avoids double-reporting. - prometheus_metrics = metrics ++ Map.get(opts, :additional_prometheus_metrics, []) + # `ets.table.*` are high-cardinality per-table gauges tagged by the raw table name + # (which embeds per-shape/stack ids). That rotating top-N set is the intended trade-off + # for OTel/Honeycomb (the exporter clears its series between exports) and is harmless for + # StatsD (each value is pushed as it's emitted, so nothing accumulates in-process). But + # `TelemetryMetricsPrometheus.Core` keeps every series in its registry + # with no TTL — so a rotating top-N set would accumulate stale series indefinitely. Keep + # these off the Prometheus `/metrics` path; its `ets.memory.total` (by `table_type`) stays. + # + # `additional_prometheus_metrics` are, conversely, Prometheus-only extras (stack-level + # metrics the other reporters already export per-stack) appended to avoid double-reporting. + prometheus_metrics = + Enum.reject(metrics, &prometheus_excluded?/1) ++ + Map.get(opts, :additional_prometheus_metrics, []) [ Reporters.CallHomeReporter.child_spec( @@ -57,6 +67,11 @@ defmodule ElectricTelemetry.ApplicationTelemetry do ] end + # Per-table ETS gauges are excluded from the (TTL-less) Prometheus registry; see + # `exporter_child_specs/1`. They still flow to OTel and StatsD. + defp prometheus_excluded?(%{name: [:ets, :table | _]}), do: true + defp prometheus_excluded?(_), do: false + @impl ElectricTelemetry.Poller def builtin_periodic_measurements(telemetry_opts) do [ @@ -82,6 +97,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do :process_memory, :process_bin_memory, :ets_memory, + :ets_table_memory, :get_system_load_average, :get_system_memory_usage ], @@ -98,6 +114,8 @@ defmodule ElectricTelemetry.ApplicationTelemetry do last_value("process.bin_memory.max_ref_count", tags: [:process_type]), last_value("process.bin_memory.avg_ref_count", tags: [:process_type]), last_value("ets.memory.total", tags: [:table_type], unit: :byte), + last_value("ets.table.memory", tags: [:table_name, :table_type], unit: :byte), + last_value("ets.table.size", tags: [:table_name, :table_type]), last_value("system.cpu.core_count"), last_value("system.cpu.utilization.total"), last_value("system.load_percent.avg1"), @@ -186,7 +204,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do :telemetry.execute( [:process, :memory], %{total: map.proc_mem}, - %{process_type: to_string(map.type)} + %{process_type: map.type} ) end end @@ -202,7 +220,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do max_ref_count: map.max_ref_count, avg_ref_count: map.avg_ref_count }, - %{process_type: to_string(map.type)} + %{process_type: map.type} ) end end @@ -210,7 +228,20 @@ defmodule ElectricTelemetry.ApplicationTelemetry do def ets_memory(%{intervals_and_thresholds: %{top_ets_table_count: ets_table_count}}) do for %{type: type, memory: memory} <- ElectricTelemetry.EtsTables.top_by_type(ets_table_count) do - :telemetry.execute([:ets, :memory], %{total: memory}, %{table_type: to_string(type)}) + :telemetry.execute([:ets, :memory], %{total: memory}, %{table_type: type}) + end + end + + def ets_table_memory(%{ + intervals_and_thresholds: %{top_ets_individual_count: individual_count} + }) do + for %{name: name, type: type, memory: memory, size: size} <- + ElectricTelemetry.EtsTables.top_tables(individual_count) do + :telemetry.execute( + [:ets, :table], + %{memory: memory, size: size}, + %{table_name: name, table_type: type} + ) end end diff --git a/packages/electric-telemetry/lib/electric/telemetry/opts.ex b/packages/electric-telemetry/lib/electric/telemetry/opts.ex index fedd966578..992bb4626f 100644 --- a/packages/electric-telemetry/lib/electric/telemetry/opts.ex +++ b/packages/electric-telemetry/lib/electric/telemetry/opts.ex @@ -38,6 +38,7 @@ defmodule ElectricTelemetry.Opts do default: {:count, 5} ], top_ets_table_count: [type: :integer, default: 10], + top_ets_individual_count: [type: :integer, default: 10], # Garbage collection should run almost instantly since each process has its own heap that # is garbage collected independently of others. 50ms might be too generous. long_gc_threshold: [type: :integer, default: 50], diff --git a/packages/electric-telemetry/test/electric/telemetry/application_telemetry_test.exs b/packages/electric-telemetry/test/electric/telemetry/application_telemetry_test.exs index 5a4eba5657..33dc997ad3 100644 --- a/packages/electric-telemetry/test/electric/telemetry/application_telemetry_test.exs +++ b/packages/electric-telemetry/test/electric/telemetry/application_telemetry_test.exs @@ -35,6 +35,37 @@ defmodule ElectricTelemetry.ApplicationTelemetryTest do end end + describe "ets.table.* metric routing" do + setup do + {:ok, opts} = + ElectricTelemetry.validate_options( + instance_id: "test-instance", + version: "1.0.0", + reporters: [prometheus?: true, statsd_host: "localhost", otel_metrics?: true] + ) + + {:ok, {_flags, children}} = ApplicationTelemetry.init(opts) + %{children: children} + end + + test "reach OTel and StatsD but are kept off the Prometheus registry", %{children: children} do + ets_table? = &(&1.name in [[:ets, :table, :memory], [:ets, :table, :size]]) + + assert Enum.any?(reporter_metrics(children, OtelMetricExporter), ets_table?) + assert Enum.any?(reporter_metrics(children, TelemetryMetricsStatsd), ets_table?) + refute Enum.any?(reporter_metrics(children, :prometheus_metrics), ets_table?) + end + + test "the bounded ets.memory.total (by table_type) still reaches Prometheus", %{ + children: children + } do + assert Enum.any?( + reporter_metrics(children, :prometheus_metrics), + &(&1.name == [:ets, :memory, :total]) + ) + end + end + # `Supervisor.init/2` normalises child specs into maps, nesting the reporter's start args # (which carry `:metrics`) inside `:start`. Reporters are identified by their child spec id. defp reporter_metrics(children, id) do @@ -72,4 +103,58 @@ defmodule ElectricTelemetry.ApplicationTelemetryTest do end end end + + describe "ets_table_memory/1" do + test "emits a [:ets, :table] event per top table with memory/size and name/type tags" do + table = :ets.new(:"ApplicationTelemetryTest:ets_table_memory", [:public, :named_table]) + for i <- 1..200, do: :ets.insert(table, {i, :binary.copy(<<0>>, 1000)}) + + ref = make_ref() + test_pid = self() + handler_id = {__MODULE__, :ets_table, ref} + + :telemetry.attach( + handler_id, + [:ets, :table], + fn _event, measurements, metadata, _config -> + send(test_pid, {ref, measurements, metadata}) + end, + nil + ) + + on_exit(fn -> :telemetry.detach(handler_id) end) + + ApplicationTelemetry.ets_table_memory(%{ + intervals_and_thresholds: %{top_ets_individual_count: 100} + }) + + events = collect_events(ref, []) + + assert events != [] + + for {measurements, metadata} <- events do + assert %{memory: memory, size: size} = measurements + assert is_integer(memory) and memory > 0 + assert is_integer(size) and size >= 0 + # Tag values are left as-is (atom name, string type); every reporter + # stringifies tag values itself, so no to_string/1 at the call site. + assert %{table_name: name, table_type: type} = metadata + assert is_atom(name) + assert is_binary(type) + end + + # Our named test table should be among the emitted tables. + assert Enum.any?(events, fn {_measurements, metadata} -> + metadata.table_name == :"ApplicationTelemetryTest:ets_table_memory" + end) + end + end + + defp collect_events(ref, acc) do + receive do + {^ref, measurements, metadata} -> collect_events(ref, [{measurements, metadata} | acc]) + after + 0 -> Enum.reverse(acc) + end + end end