akkoma/lib/pleroma/web/telemetry.ex

defmodule Pleroma.Web.Telemetry do
  use Supervisor
  import Telemetry.Metrics
  alias Pleroma.Stats
  alias Pleroma.Config

  @doc """
  Starts the telemetry supervisor and registers it under this module's name.

  `arg` is forwarded to `init/1`, which ignores it.
  """
  @spec start_link(term()) :: Supervisor.on_start()
  def start_link(arg), do: Supervisor.start_link(__MODULE__, arg, name: __MODULE__)

  # Supervisor callback: always supervises the periodic poller and, depending on
  # what `prometheus_children/0` returns, the Prometheus reporter processes.
  @impl true
  def init(_arg) do
    # Runs every entry of periodic_measurements/0 once per period
    # (10_000; telemetry_poller documents the period as milliseconds).
    poller = {:telemetry_poller, measurements: periodic_measurements(), period: 10_000}

    Supervisor.init([poller | prometheus_children()], strategy: :one_for_one)
  end

  # Child specs for the Prometheus reporter and exporter.
  #
  # Returns an empty list when `[:instance, :export_prometheus_metrics]` is
  # configured as `nil` or `false`; the setting defaults to `true`.
  defp prometheus_children do
    case Config.get([:instance, :export_prometheus_metrics], true) do
      disabled when disabled in [nil, false] ->
        []

      _enabled ->
        [
          {TelemetryMetricsPrometheus.Core, metrics: prometheus_metrics()},
          Pleroma.PrometheusExporter
        ]
    end
  end

  # A separate set of metrics for distributions because phoenix dashboard does NOT handle them well
  defp distribution_metrics do
    # Histogram bucket boundaries shared by several of the metrics below
    wide_buckets = [0.1, 0.2, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000]
    fine_buckets = [0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5, 10]

    # The Oban event metadata carries the worker under `job`; copy it to a
    # top-level `:worker` key so it can be used as a tag.
    oban_worker_tag = fn metadata -> Map.put(metadata, :worker, metadata.job.worker) end

    # Expose the status of the Tesla env as a `:response_code` tag
    tesla_status_tag = fn metadata ->
      Map.put(metadata, :response_code, metadata.env.status)
    end

    # Phoenix request time per route
    phoenix_metrics = [
      distribution("phoenix.router_dispatch.stop.duration",
        measurement: :duration,
        unit: {:native, :second},
        tags: [:route],
        reporter_options: [buckets: wide_buckets]
      )
    ]

    # Database time metrics
    repo_metrics = [
      distribution("pleroma.repo.query.total_time",
        measurement: :total_time,
        unit: {:native, :millisecond},
        reporter_options: [buckets: wide_buckets]
      ),
      distribution("pleroma.repo.query.queue_time",
        measurement: :queue_time,
        unit: {:native, :millisecond},
        reporter_options: [buckets: fine_buckets]
      )
    ]

    # Background job and outgoing HTTP request durations
    job_and_http_metrics = [
      distribution("oban_job_exception",
        event_name: [:oban, :job, :exception],
        measurement: :duration,
        tags: [:worker],
        tag_values: oban_worker_tag,
        unit: {:native, :second},
        reporter_options: [buckets: fine_buckets]
      ),
      distribution("tesla_request_completed",
        event_name: [:tesla, :request, :stop],
        measurement: :duration,
        tags: [:response_code],
        tag_values: tesla_status_tag,
        unit: {:native, :second},
        reporter_options: [buckets: fine_buckets]
      ),
      distribution("oban_job_completion",
        event_name: [:oban, :job, :stop],
        measurement: :duration,
        tags: [:worker],
        tag_values: oban_worker_tag,
        unit: {:native, :second},
        reporter_options: [buckets: fine_buckets]
      )
    ]

    phoenix_metrics ++ repo_metrics ++ job_and_http_metrics
  end

  # Summary metrics are currently not (yet) supported by the prometheus exporter
  defp summary_metrics(byte_unit) do
    # Every timing below is converted from native time units to milliseconds
    millis = {:native, :millisecond}

    # Phoenix metrics
    phoenix_timings = [
      summary("phoenix.endpoint.stop.duration", unit: millis),
      summary("phoenix.router_dispatch.stop.duration", tags: [:route], unit: millis)
    ]

    # Repo query metrics
    repo_timings =
      Enum.map(
        [
          "pleroma.repo.query.total_time",
          "pleroma.repo.query.decode_time",
          "pleroma.repo.query.query_time",
          "pleroma.repo.query.queue_time",
          "pleroma.repo.query.idle_time"
        ],
        &summary(&1, unit: millis)
      )

    # VM metrics; memory is scaled from bytes to the caller-chosen `byte_unit`
    vm_memory = [summary("vm.memory.total", unit: {:byte, byte_unit})]

    run_queue_lengths =
      Enum.map(
        [
          "vm.total_run_queue_lengths.total",
          "vm.total_run_queue_lengths.cpu",
          "vm.total_run_queue_lengths.io"
        ],
        &summary/1
      )

    phoenix_timings ++ repo_timings ++ vm_memory ++ run_queue_lengths
  end

  # Builds a `sum` metric named `<basename>.psum` and a `counter` metric named
  # `<basename>.pcount`, both configured with the same `opts`, and returns
  # them as a two-element list (sum first).
  defp sum_counter_pair(basename, opts) do
    running_sum = sum(basename <> ".psum", opts)
    sample_count = counter(basename <> ".pcount", opts)

    [running_sum, sample_count]
  end

  # Prometheus exporter doesn't support summaries, so provide fallbacks
  defp summary_fallback_metrics(byte_unit \\ :byte) do
    # Summary metrics are not supported by the Prometheus exporter
    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/11
    # and sum metrics currently only work with integers
    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/35
    #
    # For VM metrics this is kind of ok as they appear to always be integers
    # and we can use sum + counter to get the average between polls from their change.
    # But for repo query times we need to use a full distribution.
    #
    # `byte_unit` is the unit VM memory is converted to (defaults to plain bytes).
    # Returns the distribution fallbacks followed by the VM sum/counter pairs.

    simple_buckets = [0, 1, 2, 4, 8, 16]
    # Same boundaries divided by 100; used for the decode time distribution
    simple_buckets_quick = for t <- simple_buckets, do: t / 100.0

    # Already included in distribution metrics anyway:
    #   phoenix.router_dispatch.stop.duration
    #   pleroma.repo.query.total_time
    #   pleroma.repo.query.queue_time
    dist_metrics =
      [
        distribution("phoenix.endpoint.stop.duration.fdist",
          event_name: [:phoenix, :endpoint, :stop],
          measurement: :duration,
          unit: {:native, :millisecond},
          reporter_options: [
            buckets: simple_buckets
          ]
        ),
        distribution("pleroma.repo.query.decode_time.fdist",
          event_name: [:pleroma, :repo, :query],
          measurement: :decode_time,
          unit: {:native, :millisecond},
          reporter_options: [
            buckets: simple_buckets_quick
          ]
        ),
        distribution("pleroma.repo.query.query_time.fdist",
          event_name: [:pleroma, :repo, :query],
          measurement: :query_time,
          unit: {:native, :millisecond},
          reporter_options: [
            buckets: simple_buckets
          ]
        ),
        distribution("pleroma.repo.query.idle_time.fdist",
          event_name: [:pleroma, :repo, :query],
          measurement: :idle_time,
          unit: {:native, :millisecond},
          reporter_options: [
            buckets: simple_buckets
          ]
        )
      ]

    # Each pair is exported as `<name>.psum` and `<name>.pcount`
    # (the suffixes are appended by sum_counter_pair/2).
    vm_metrics =
      sum_counter_pair("vm.memory.total",
        event_name: [:vm, :memory],
        measurement: :total,
        unit: {:byte, byte_unit}
      ) ++
        sum_counter_pair("vm.total_run_queue_lengths.total",
          event_name: [:vm, :total_run_queue_lengths],
          measurement: :total
        ) ++
        sum_counter_pair("vm.total_run_queue_lengths.cpu",
          event_name: [:vm, :total_run_queue_lengths],
          measurement: :cpu
        ) ++
        # Base name carries no extra suffix, matching the sibling pairs above
        sum_counter_pair("vm.total_run_queue_lengths.io",
          event_name: [:vm, :total_run_queue_lengths],
          measurement: :io
        )

    dist_metrics ++ vm_metrics
  end

  # Last-value metrics for the `:total` measurements emitted by
  # `instance_stats/0`; part of both the Prometheus and the dashboard sets.
  defp common_metrics do
    Enum.map(
      [
        "pleroma.local_users.total",
        "pleroma.domains.total",
        "pleroma.local_statuses.total",
        "pleroma.remote_users.total"
      ],
      &last_value/1
    )
  end

  @doc """
  Returns the metric definitions handed to the Prometheus reporter.

  The list is made up of, in order, the common last-value metrics, the
  distribution metrics, and the distribution/sum/counter fallbacks that stand
  in for summary metrics.
  """
  @spec prometheus_metrics() :: [Telemetry.Metrics.t()]
  def prometheus_metrics do
    Enum.concat([common_metrics(), distribution_metrics(), summary_fallback_metrics()])
  end

  @doc """
  Returns the metric definitions meant for the live dashboard (per the
  function name): the common last-value metrics followed by the summary
  metrics, with VM memory expressed in megabytes.
  """
  @spec live_dashboard_metrics() :: [Telemetry.Metrics.t()]
  def live_dashboard_metrics do
    gauges = common_metrics()
    summaries = summary_metrics(:megabyte)

    gauges ++ summaries
  end

  # Measurements handed to the poller in `init/1`, each given as a
  # `{module, function, args}` tuple.
  defp periodic_measurements, do: [{__MODULE__, :instance_stats, []}]

  @doc """
  Publishes the current instance statistics as telemetry events.

  Reads the stats once via `Pleroma.Stats.get_stats/0`, then emits one
  `[:pleroma, <name>]` event per figure with the value under the `:total`
  measurement and empty metadata. Raises `KeyError` if an expected stat is
  missing from the stats map.
  """
  @spec instance_stats() :: :ok
  def instance_stats do
    stats = Stats.get_stats()

    # event name suffix => key in the stats map, emitted in this order
    events = [
      local_users: :user_count,
      domains: :domain_count,
      local_statuses: :status_count,
      remote_users: :remote_user_count
    ]

    Enum.each(events, fn {event, stat_key} ->
      :telemetry.execute([:pleroma, event], %{total: Map.fetch!(stats, stat_key)}, %{})
    end)
  end
end
Start adding telemetry 2022-11-12 10:14:16 +00:00			`defmodule Pleroma.Web.Telemetry do`
			`use Supervisor`
			`import Telemetry.Metrics`
Measure stats-data 2022-11-12 16:13:39 +00:00			`alias Pleroma.Stats`
Use a genserver to periodically fetch metrics Ref https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/52 2023-01-01 18:32:14 +00:00			`alias Pleroma.Config`
Start adding telemetry 2022-11-12 10:14:16 +00:00
			`def start_link(arg) do`
			`Supervisor.start_link(__MODULE__, arg, name: __MODULE__)`
			`end`

			`@impl true`
			`def init(_arg) do`
Use a genserver to periodically fetch metrics Ref https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/52 2023-01-01 18:32:14 +00:00			`children =`
			`[`
			`{:telemetry_poller, measurements: periodic_measurements(), period: 10_000}`
			`] ++`
			`prometheus_children()`
Start adding telemetry 2022-11-12 10:14:16 +00:00
			`Supervisor.init(children, strategy: :one_for_one)`
			`end`

Use a genserver to periodically fetch metrics Ref https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/52 2023-01-01 18:32:14 +00:00			`defp prometheus_children do`
			`config = Config.get([:instance, :export_prometheus_metrics], true)`

			`if config do`
			`[`
			`{TelemetryMetricsPrometheus.Core, metrics: prometheus_metrics()},`
			`Pleroma.PrometheusExporter`
			`]`
			`else`
			`[]`
			`end`
			`end`

allow users with admin:metrics to read app metrics 2022-12-16 03:32:51 +00:00			`# A seperate set of metrics for distributions because phoenix dashboard does NOT handle them well`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`defp distribution_metrics do`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`[`
			`distribution(`
			`"phoenix.router_dispatch.stop.duration",`
			`# event_name: [:pleroma, :repo, :query, :total_time],`
			`measurement: :duration,`
			`unit: {:native, :second},`
			`tags: [:route],`
			`reporter_options: [`
Fix buckets for query timing 2022-11-12 10:23:44 +00:00			`buckets: [0.1, 0.2, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000]`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`]`
			`),`

			`# Database Time Metrics`
			`distribution(`
			`"pleroma.repo.query.total_time",`
			`# event_name: [:pleroma, :repo, :query, :total_time],`
			`measurement: :total_time,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
Fix buckets for query timing 2022-11-12 10:23:44 +00:00			`buckets: [0.1, 0.2, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000]`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`]`
			`),`
			`distribution(`
			`"pleroma.repo.query.queue_time",`
			`# event_name: [:pleroma, :repo, :query, :total_time],`
			`measurement: :queue_time,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
			`buckets: [0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5, 10]`
			`]`
			`),`
			`distribution(`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`"oban_job_exception",`
			`event_name: [:oban, :job, :exception],`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`measurement: :duration,`
Fix oban tags 2022-11-12 10:54:35 +00:00			`tags: [:worker],`
			`tag_values: fn tags -> Map.put(tags, :worker, tags.job.worker) end,`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`unit: {:native, :second},`
			`reporter_options: [`
			`buckets: [0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5, 10]`
			`]`
Track oban failures 2022-11-12 11:42:53 +00:00			`),`
			`distribution(`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`"tesla_request_completed",`
			`event_name: [:tesla, :request, :stop],`
Track oban failures 2022-11-12 11:42:53 +00:00			`measurement: :duration,`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`tags: [:response_code],`
			`tag_values: fn tags -> Map.put(tags, :response_code, tags.env.status) end,`
Track oban failures 2022-11-12 11:42:53 +00:00			`unit: {:native, :second},`
			`reporter_options: [`
			`buckets: [0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5, 10]`
			`]`
Take tesla telemetry 2022-11-12 15:11:38 +00:00			`),`
			`distribution(`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`"oban_job_completion",`
			`event_name: [:oban, :job, :stop],`
Take tesla telemetry 2022-11-12 15:11:38 +00:00			`measurement: :duration,`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`tags: [:worker],`
			`tag_values: fn tags -> Map.put(tags, :worker, tags.job.worker) end,`
Take tesla telemetry 2022-11-12 15:11:38 +00:00			`unit: {:native, :second},`
			`reporter_options: [`
			`buckets: [0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5, 10]`
			`]`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`)`
			`]`
			`end`

Don't add summary metrics to prometheus The exporter doesn’t support them thus we don't lose anything by this, but it avoids a bunch of warnings each time the server starts up. 2024-02-03 16:30:00 +00:00			`# Summary metrics are currently not (yet) supported by the prometheus exporter`
Display memory as MB in live dashboard With kilobyte the resulting numbers got too large and were cut off in the charts, making them useless. However, even an idle Akkoma server’s memory usage is in the lower hundreths of megabytes, so we don’t need this much precision to begin with for the dashboard. Other metric users might prefer base units and can handle scaling in a smarter way, so keep this configurable. 2024-02-03 17:21:09 +00:00			`defp summary_metrics(byte_unit) do`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`[`
			`# Phoenix Metrics`
			`summary("phoenix.endpoint.stop.duration",`
			`unit: {:native, :millisecond}`
			`),`
			`summary("phoenix.router_dispatch.stop.duration",`
			`tags: [:route],`
			`unit: {:native, :millisecond}`
Measure stats-data 2022-11-12 16:13:39 +00:00			`),`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`summary("pleroma.repo.query.total_time", unit: {:native, :millisecond}),`
			`summary("pleroma.repo.query.decode_time", unit: {:native, :millisecond}),`
			`summary("pleroma.repo.query.query_time", unit: {:native, :millisecond}),`
			`summary("pleroma.repo.query.queue_time", unit: {:native, :millisecond}),`
			`summary("pleroma.repo.query.idle_time", unit: {:native, :millisecond}),`

			`# VM Metrics`
Display memory as MB in live dashboard With kilobyte the resulting numbers got too large and were cut off in the charts, making them useless. However, even an idle Akkoma server’s memory usage is in the lower hundreths of megabytes, so we don’t need this much precision to begin with for the dashboard. Other metric users might prefer base units and can handle scaling in a smarter way, so keep this configurable. 2024-02-03 17:21:09 +00:00			`summary("vm.memory.total", unit: {:byte, byte_unit}),`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00			`summary("vm.total_run_queue_lengths.total"),`
			`summary("vm.total_run_queue_lengths.cpu"),`
Don't add summary metrics to prometheus The exporter doesn’t support them thus we don't lose anything by this, but it avoids a bunch of warnings each time the server starts up. 2024-02-03 16:30:00 +00:00			`summary("vm.total_run_queue_lengths.io")`
			`]`
			`end`

Use fallbacks of summary metrics for prometheus 2024-02-03 17:28:55 +00:00			`defp sum_counter_pair(basename, opts) do`
			`[`
			`sum(basename <> ".psum", opts),`
			`counter(basename <> ".pcount", opts)`
			`]`
			`end`

			`# Prometheus exporter doesn't support summaries, so provide fallbacks`
			`defp summary_fallback_metrics(byte_unit \\ :byte) do`
			`# Summary metrics are not supported by the Prometheus exporter`
			`# https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/11`
			`# and sum metrics currently only work with integers`
			`# https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/35`
			`#`
			`# For VM metrics this is kindof ok as they appear to always be integers`
			`# and we can use sum + counter to get the average between polls from their change`
			`# But for repo query times we need to use a full distribution`

			`simple_buckets = [0, 1, 2, 4, 8, 16]`
			`simple_buckets_quick = for t <- simple_buckets, do: t / 100.0`

			`# Already included in distribution metrics anyway:`
			`# phoenix.router_dispatch.stop.duration`
			`# pleroma.repo.query.total_time`
			`# pleroma.repo.query.queue_time`
			`dist_metrics =`
			`[`
			`distribution("phoenix.endpoint.stop.duration.fdist",`
			`event_name: [:phoenix, :endpoint, :stop],`
			`measurement: :duration,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
			`buckets: simple_buckets`
			`]`
			`),`
			`distribution("pleroma.repo.query.decode_time.fdist",`
			`event_name: [:pleroma, :repo, :query],`
			`measurement: :decode_time,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
			`buckets: simple_buckets_quick`
			`]`
			`),`
			`distribution("pleroma.repo.query.query_time.fdist",`
			`event_name: [:pleroma, :repo, :query],`
			`measurement: :query_time,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
			`buckets: simple_buckets`
			`]`
			`),`
			`distribution("pleroma.repo.query.idle_time.fdist",`
			`event_name: [:pleroma, :repo, :query],`
			`measurement: :idle_time,`
			`unit: {:native, :millisecond},`
			`reporter_options: [`
			`buckets: simple_buckets`
			`]`
			`)`
			`]`

			`vm_metrics =`
			`sum_counter_pair("vm.memory.total",`
			`event_name: [:vm, :memory],`
			`measurement: :total,`
			`unit: {:byte, byte_unit}`
			`) ++`
			`sum_counter_pair("vm.total_run_queue_lengths.total",`
			`event_name: [:vm, :total_run_queue_lengths],`
			`measurement: :total`
			`) ++`
			`sum_counter_pair("vm.total_run_queue_lengths.cpu",`
			`event_name: [:vm, :total_run_queue_lengths],`
			`measurement: :cpu`
			`) ++`
			`sum_counter_pair("vm.total_run_queue_lengths.io.fsum",`
			`event_name: [:vm, :total_run_queue_lengths],`
			`measurement: :io`
			`)`

			`dist_metrics ++ vm_metrics`
			`end`

Don't add summary metrics to prometheus The exporter doesn’t support them thus we don't lose anything by this, but it avoids a bunch of warnings each time the server starts up. 2024-02-03 16:30:00 +00:00			`defp common_metrics do`
			`[`
Measure stats-data 2022-11-12 16:13:39 +00:00			`last_value("pleroma.local_users.total"),`
			`last_value("pleroma.domains.total"),`
add remote user count for the heck of it 2022-12-16 17:22:26 +00:00			`last_value("pleroma.local_statuses.total"),`
			`last_value("pleroma.remote_users.total")`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`]`
			`end`

Use fallbacks of summary metrics for prometheus 2024-02-03 17:28:55 +00:00			`def prometheus_metrics,`
			`do: common_metrics() ++ distribution_metrics() ++ summary_fallback_metrics()`

Display memory as MB in live dashboard With kilobyte the resulting numbers got too large and were cut off in the charts, making them useless. However, even an idle Akkoma server’s memory usage is in the lower hundreths of megabytes, so we don’t need this much precision to begin with for the dashboard. Other metric users might prefer base units and can handle scaling in a smarter way, so keep this configurable. 2024-02-03 17:21:09 +00:00			`def live_dashboard_metrics, do: common_metrics() ++ summary_metrics(:megabyte)`
Add prometheus metrics to router 2022-12-15 02:02:07 +00:00
Start adding telemetry 2022-11-12 10:14:16 +00:00			`defp periodic_measurements do`
Measure stats-data 2022-11-12 16:13:39 +00:00			`[`
			`{__MODULE__, :instance_stats, []}`
			`]`
			`end`

			`def instance_stats do`
			`stats = Stats.get_stats()`
			`:telemetry.execute([:pleroma, :local_users], %{total: stats.user_count}, %{})`
			`:telemetry.execute([:pleroma, :domains], %{total: stats.domain_count}, %{})`
			`:telemetry.execute([:pleroma, :local_statuses], %{total: stats.status_count}, %{})`
add remote user count for the heck of it 2022-12-16 17:22:26 +00:00			`:telemetry.execute([:pleroma, :remote_users], %{total: stats.remote_user_count}, %{})`
Start adding telemetry 2022-11-12 10:14:16 +00:00			`end`
			`end`