Commit 3a2b2cb6 authored by feld's avatar feld

Merge branch 'refactor/gun-pool-registry' into 'develop'

Refactor gun pooling and simplify adapter option insertion

Closes #1834, #1700, and #1680

See merge request !2479
parents 00f9b53f b2d398b1
......@@ -172,7 +172,7 @@
"application/ld+json" => ["activity+json"]
}
config :tesla, adapter: Tesla.Adapter.Hackney
config :tesla, adapter: Tesla.Adapter.Gun
# Configures http settings, upstream proxy etc.
config :pleroma, :http,
......@@ -648,32 +648,30 @@
prepare: :unnamed
config :pleroma, :connections_pool,
checkin_timeout: 250,
reclaim_multiplier: 0.1,
connection_acquisition_wait: 250,
connection_acquisition_retries: 5,
max_connections: 250,
retry: 1,
retry_timeout: 1000,
max_idle_time: 30_000,
retry: 0,
await_up_timeout: 5_000
config :pleroma, :pools,
federation: [
size: 50,
max_overflow: 10,
timeout: 150_000
max_waiting: 10
],
media: [
size: 50,
max_overflow: 10,
timeout: 150_000
max_waiting: 10
],
upload: [
size: 25,
max_overflow: 5,
timeout: 300_000
max_waiting: 5
],
default: [
size: 10,
max_overflow: 2,
timeout: 10_000
max_waiting: 2
]
config :pleroma, :hackney_pools,
......
......@@ -3161,36 +3161,37 @@
description: "Advanced settings for `gun` connections pool",
children: [
%{
key: :checkin_timeout,
key: :connection_acquisition_wait,
type: :integer,
description: "Timeout to checkin connection from pool. Default: 250ms.",
suggestions: [250]
},
%{
key: :max_connections,
type: :integer,
description: "Maximum number of connections in the pool. Default: 250 connections.",
description:
"Timeout to acquire a connection from pool.The total max time is this value multiplied by the number of retries. Default: 250ms.",
suggestions: [250]
},
%{
key: :retry,
key: :connection_acquisition_retries,
type: :integer,
description:
"Number of retries, while `gun` will try to reconnect if connection goes down. Default: 1.",
suggestions: [1]
"Number of attempts to acquire the connection from the pool if it is overloaded. Default: 5",
suggestions: [5]
},
%{
key: :retry_timeout,
key: :max_connections,
type: :integer,
description:
"Time between retries when `gun` will try to reconnect in milliseconds. Default: 1000ms.",
suggestions: [1000]
description: "Maximum number of connections in the pool. Default: 250 connections.",
suggestions: [250]
},
%{
key: :await_up_timeout,
type: :integer,
description: "Timeout while `gun` will wait until connection is up. Default: 5000ms.",
suggestions: [5000]
},
%{
key: :reclaim_multiplier,
type: :integer,
description:
"Multiplier for the number of idle connection to be reclaimed if the pool is full. For example if the pool maxes out at 250 connections and this setting is set to 0.3, the pool will reclaim at most 75 idle connections if it's overloaded. Default: 0.1",
suggestions: [0.1]
}
]
},
......@@ -3199,108 +3200,29 @@
key: :pools,
type: :group,
description: "Advanced settings for `gun` workers pools",
children: [
%{
key: :federation,
type: :keyword,
description: "Settings for federation pool.",
children: [
%{
key: :size,
type: :integer,
description: "Number workers in the pool.",
suggestions: [50]
},
%{
key: :max_overflow,
type: :integer,
description: "Number of additional workers if pool is under load.",
suggestions: [10]
},
%{
key: :timeout,
type: :integer,
description: "Timeout while `gun` will wait for response.",
suggestions: [150_000]
}
]
},
%{
key: :media,
type: :keyword,
description: "Settings for media pool.",
children: [
%{
key: :size,
type: :integer,
description: "Number workers in the pool.",
suggestions: [50]
},
%{
key: :max_overflow,
type: :integer,
description: "Number of additional workers if pool is under load.",
suggestions: [10]
},
%{
key: :timeout,
type: :integer,
description: "Timeout while `gun` will wait for response.",
suggestions: [150_000]
}
]
},
%{
key: :upload,
type: :keyword,
description: "Settings for upload pool.",
children: [
%{
key: :size,
type: :integer,
description: "Number workers in the pool.",
suggestions: [25]
},
%{
key: :max_overflow,
type: :integer,
description: "Number of additional workers if pool is under load.",
suggestions: [5]
},
%{
key: :timeout,
type: :integer,
description: "Timeout while `gun` will wait for response.",
suggestions: [300_000]
}
]
},
%{
key: :default,
type: :keyword,
description: "Settings for default pool.",
children: [
%{
key: :size,
type: :integer,
description: "Number workers in the pool.",
suggestions: [10]
},
%{
key: :max_overflow,
type: :integer,
description: "Number of additional workers if pool is under load.",
suggestions: [2]
},
%{
key: :timeout,
type: :integer,
description: "Timeout while `gun` will wait for response.",
suggestions: [10_000]
}
]
}
]
children:
Enum.map([:federation, :media, :upload, :default], fn pool_name ->
%{
key: pool_name,
type: :keyword,
description: "Settings for #{pool_name} pool.",
children: [
%{
key: :size,
type: :integer,
description: "Maximum number of concurrent requests in the pool.",
suggestions: [50]
},
%{
key: :max_waiting,
type: :integer,
description:
"Maximum number of requests waiting for other requests to finish. After this number is reached, the pool will start returning errrors when a new request is made",
suggestions: [10]
}
]
}
end)
},
%{
group: :pleroma,
......
......@@ -448,36 +448,32 @@ For each pool, the options are:
*For `gun` adapter*
Advanced settings for connections pool. Pool with opened connections. These connections can be reused in worker pools.
Settings for HTTP connection pool.
For big instances it's recommended to increase `config :pleroma, :connections_pool, max_connections: 500` up to 500-1000.
It will increase memory usage, but federation would work faster.
* `:checkin_timeout` - timeout to checkin connection from pool. Default: 250ms.
* `:max_connections` - maximum number of connections in the pool. Default: 250 connections.
* `:retry` - number of retries, while `gun` will try to reconnect if connection goes down. Default: 1.
* `:retry_timeout` - time between retries when `gun` will try to reconnect in milliseconds. Default: 1000ms.
* `:await_up_timeout` - timeout while `gun` will wait until connection is up. Default: 5000ms.
* `:connection_acquisition_wait` - Timeout to acquire a connection from pool.The total max time is this value multiplied by the number of retries.
* `connection_acquisition_retries` - Number of attempts to acquire the connection from the pool if it is overloaded. Each attempt is timed `:connection_acquisition_wait` apart.
* `:max_connections` - Maximum number of connections in the pool.
* `:await_up_timeout` - Timeout to connect to the host.
* `:reclaim_multiplier` - Multiplied by `:max_connections` this will be the maximum number of idle connections that will be reclaimed in case the pool is overloaded.
### :pools
*For `gun` adapter*
Advanced settings for workers pools.
Settings for request pools. These pools are limited on top of `:connections_pool`.
There are four pools used:
* `:federation` for the federation jobs.
You may want this pool max_connections to be at least equal to the number of federator jobs + retry queue jobs.
* `:media` for rich media, media proxy
* `:upload` for uploaded media (if using a remote uploader and `proxy_remote: true`)
* `:default` for other requests
* `:federation` for the federation jobs. You may want this pool's max_connections to be at least equal to the number of federator jobs + retry queue jobs.
* `:media` - for rich media, media proxy.
* `:upload` - for proxying media when a remote uploader is used and `proxy_remote: true`.
* `:default` - for other requests.
For each pool, the options are:
* `:size` - how much workers the pool can hold
* `:size` - limit to how much requests can be concurrently executed.
* `:timeout` - timeout while `gun` will wait for response
* `:max_overflow` - additional workers if pool is under load
* `:max_waiting` - limit to how much requests can be waiting for others to finish, after this is reached, subsequent requests will be dropped.
## Captcha
......
......@@ -39,6 +39,7 @@ def start(_type, _args) do
# every time the application is restarted, so we disable module
# conflicts at runtime
Code.compiler_options(ignore_module_conflict: true)
Pleroma.Telemetry.Logger.attach()
Config.Holder.save_default()
Pleroma.HTML.compile_scrubbers()
Config.DeprecationWarnings.warn()
......@@ -223,9 +224,7 @@ defp task_children(_) do
# start hackney and gun pools in tests
defp http_children(_, :test) do
hackney_options = Config.get([:hackney_pools, :federation])
hackney_pool = :hackney_pool.child_spec(:federation, hackney_options)
[hackney_pool, Pleroma.Pool.Supervisor]
http_children(Tesla.Adapter.Hackney, nil) ++ http_children(Tesla.Adapter.Gun, nil)
end
defp http_children(Tesla.Adapter.Hackney, _) do
......@@ -244,7 +243,10 @@ defp http_children(Tesla.Adapter.Hackney, _) do
end
end
defp http_children(Tesla.Adapter.Gun, _), do: [Pleroma.Pool.Supervisor]
defp http_children(Tesla.Adapter.Gun, _) do
Pleroma.Gun.ConnectionPool.children() ++
[{Task, &Pleroma.HTTP.AdapterHelper.Gun.limiter_setup/0}]
end
defp http_children(_, _), do: []
end
......@@ -19,7 +19,8 @@ defmodule Pleroma.Gun.API do
:tls_opts,
:tcp_opts,
:socks_opts,
:ws_opts
:ws_opts,
:supervise
]
@impl Gun
......
......@@ -3,85 +3,33 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Gun.Conn do
@moduledoc """
Struct for gun connection data
"""
alias Pleroma.Gun
alias Pleroma.Pool.Connections
require Logger
@type gun_state :: :up | :down
@type conn_state :: :active | :idle
@type t :: %__MODULE__{
conn: pid(),
gun_state: gun_state(),
conn_state: conn_state(),
used_by: [pid()],
last_reference: pos_integer(),
crf: float(),
retries: pos_integer()
}
defstruct conn: nil,
gun_state: :open,
conn_state: :init,
used_by: [],
last_reference: 0,
crf: 1,
retries: 0
@spec open(String.t() | URI.t(), atom(), keyword()) :: :ok | nil
def open(url, name, opts \\ [])
def open(url, name, opts) when is_binary(url), do: open(URI.parse(url), name, opts)
def open(%URI{} = uri, name, opts) do
def open(%URI{} = uri, opts) do
pool_opts = Pleroma.Config.get([:connections_pool], [])
opts =
opts
|> Enum.into(%{})
|> Map.put_new(:retry, pool_opts[:retry] || 1)
|> Map.put_new(:retry_timeout, pool_opts[:retry_timeout] || 1000)
|> Map.put_new(:await_up_timeout, pool_opts[:await_up_timeout] || 5_000)
|> Map.put_new(:supervise, false)
|> maybe_add_tls_opts(uri)
key = "#{uri.scheme}:#{uri.host}:#{uri.port}"
max_connections = pool_opts[:max_connections] || 250
conn_pid =
if Connections.count(name) < max_connections do
do_open(uri, opts)
else
close_least_used_and_do_open(name, uri, opts)
end
if is_pid(conn_pid) do
conn = %Pleroma.Gun.Conn{
conn: conn_pid,
gun_state: :up,
conn_state: :active,
last_reference: :os.system_time(:second)
}
:ok = Gun.set_owner(conn_pid, Process.whereis(name))
Connections.add_conn(name, key, conn)
end
do_open(uri, opts)
end
defp maybe_add_tls_opts(opts, %URI{scheme: "http"}), do: opts
defp maybe_add_tls_opts(opts, %URI{scheme: "https", host: host}) do
defp maybe_add_tls_opts(opts, %URI{scheme: "https"}) do
tls_opts = [
verify: :verify_peer,
cacertfile: CAStore.file_path(),
depth: 20,
reuse_sessions: false,
verify_fun:
{&:ssl_verify_hostname.verify_fun/3,
[check_hostname: Pleroma.HTTP.Connection.format_host(host)]}
log_level: :warning,
customize_hostname_check: [match_fun: :public_key.pkix_verify_hostname_match_fun(:https)]
]
tls_opts =
......@@ -105,7 +53,7 @@ defp do_open(uri, %{proxy: {proxy_host, proxy_port}} = opts) do
{:ok, _} <- Gun.await_up(conn, opts[:await_up_timeout]),
stream <- Gun.connect(conn, connect_opts),
{:response, :fin, 200, _} <- Gun.await(conn, stream) do
conn
{:ok, conn}
else
error ->
Logger.warn(
......@@ -141,7 +89,7 @@ defp do_open(uri, %{proxy: {proxy_type, proxy_host, proxy_port}} = opts) do
with {:ok, conn} <- Gun.open(proxy_host, proxy_port, opts),
{:ok, _} <- Gun.await_up(conn, opts[:await_up_timeout]) do
conn
{:ok, conn}
else
error ->
Logger.warn(
......@@ -155,11 +103,11 @@ defp do_open(uri, %{proxy: {proxy_type, proxy_host, proxy_port}} = opts) do
end
defp do_open(%URI{host: host, port: port} = uri, opts) do
host = Pleroma.HTTP.Connection.parse_host(host)
host = Pleroma.HTTP.AdapterHelper.parse_host(host)
with {:ok, conn} <- Gun.open(host, port, opts),
{:ok, _} <- Gun.await_up(conn, opts[:await_up_timeout]) do
conn
{:ok, conn}
else
error ->
Logger.warn(
......@@ -171,7 +119,7 @@ defp do_open(%URI{host: host, port: port} = uri, opts) do
end
defp destination_opts(%URI{host: host, port: port}) do
host = Pleroma.HTTP.Connection.parse_host(host)
host = Pleroma.HTTP.AdapterHelper.parse_host(host)
%{host: host, port: port}
end
......@@ -181,17 +129,6 @@ defp add_http2_opts(opts, "https", tls_opts) do
defp add_http2_opts(opts, _, _), do: opts
defp close_least_used_and_do_open(name, uri, opts) do
with [{key, conn} | _conns] <- Connections.get_unused_conns(name),
:ok <- Gun.close(conn.conn) do
Connections.remove_conn(name, key)
do_open(uri, opts)
else
[] -> {:error, :pool_overflowed}
end
end
def compose_uri_log(%URI{scheme: scheme, host: host, path: path}) do
"#{scheme}://#{host}#{path}"
end
......
defmodule Pleroma.Gun.ConnectionPool do
@registry __MODULE__
alias Pleroma.Gun.ConnectionPool.WorkerSupervisor
def children do
[
{Registry, keys: :unique, name: @registry},
Pleroma.Gun.ConnectionPool.WorkerSupervisor
]
end
def get_conn(uri, opts) do
key = "#{uri.scheme}:#{uri.host}:#{uri.port}"
case Registry.lookup(@registry, key) do
# The key has already been registered, but connection is not up yet
[{worker_pid, nil}] ->
get_gun_pid_from_worker(worker_pid, true)
[{worker_pid, {gun_pid, _used_by, _crf, _last_reference}}] ->
GenServer.cast(worker_pid, {:add_client, self(), false})
{:ok, gun_pid}
[] ->
# :gun.set_owner fails in :connected state for whatevever reason,
# so we open the connection in the process directly and send it's pid back
# We trust gun to handle timeouts by itself
case WorkerSupervisor.start_worker([key, uri, opts, self()]) do
{:ok, worker_pid} ->
get_gun_pid_from_worker(worker_pid, false)
{:error, {:already_started, worker_pid}} ->
get_gun_pid_from_worker(worker_pid, true)
err ->
err
end
end
end
defp get_gun_pid_from_worker(worker_pid, register) do
# GenServer.call will block the process for timeout length if
# the server crashes on startup (which will happen if gun fails to connect)
# so instead we use cast + monitor
ref = Process.monitor(worker_pid)
if register, do: GenServer.cast(worker_pid, {:add_client, self(), true})
receive do
{:conn_pid, pid} ->
Process.demonitor(ref)
{:ok, pid}
{:DOWN, ^ref, :process, ^worker_pid, reason} ->
case reason do
{:shutdown, error} -> error
_ -> {:error, reason}
end
end
end
def release_conn(conn_pid) do
# :ets.fun2ms(fn {_, {worker_pid, {gun_pid, _, _, _}}} when gun_pid == conn_pid ->
# worker_pid end)
query_result =
Registry.select(@registry, [
{{:_, :"$1", {:"$2", :_, :_, :_}}, [{:==, :"$2", conn_pid}], [:"$1"]}
])
case query_result do
[worker_pid] ->
GenServer.cast(worker_pid, {:remove_client, self()})
[] ->
:ok
end
end
end
defmodule Pleroma.Gun.ConnectionPool.Reclaimer do
use GenServer, restart: :temporary
@registry Pleroma.Gun.ConnectionPool
def start_monitor do
pid =
case :gen_server.start(__MODULE__, [], name: {:via, Registry, {@registry, "reclaimer"}}) do
{:ok, pid} ->
pid
{:error, {:already_registered, pid}} ->
pid
end
{pid, Process.monitor(pid)}
end
@impl true
def init(_) do
{:ok, nil, {:continue, :reclaim}}
end
@impl true
def handle_continue(:reclaim, _) do
max_connections = Pleroma.Config.get([:connections_pool, :max_connections])
reclaim_max =
[:connections_pool, :reclaim_multiplier]
|> Pleroma.Config.get()
|> Kernel.*(max_connections)
|> round
|> max(1)
:telemetry.execute([:pleroma, :connection_pool, :reclaim, :start], %{}, %{
max_connections: max_connections,
reclaim_max: reclaim_max
})
# :ets.fun2ms(
# fn {_, {worker_pid, {_, used_by, crf, last_reference}}} when used_by == [] ->
# {worker_pid, crf, last_reference} end)
unused_conns =
Registry.select(
@registry,
[
{{:_, :"$1", {:_, :"$2", :"$3", :"$4"}}, [{:==, :"$2", []}], [{{:"$1", :"$3", :"$4"}}]}
]
)
case unused_conns do
[] ->
:telemetry.execute(
[:pleroma, :connection_pool, :reclaim, :stop],
%{reclaimed_count: 0},
%{
max_connections: max_connections
}
)
{:stop, :no_unused_conns, nil}
unused_conns ->
reclaimed =
unused_conns
|> Enum.sort(fn {_pid1, crf1, last_reference1}, {_pid2, crf2, last_reference2} ->
crf1 <= crf2 and last_reference1 <= last_reference2
end)
|> Enum.take(reclaim_max)
reclaimed
|> Enum.each(fn {pid, _, _} ->
DynamicSupervisor.terminate_child(Pleroma.Gun.ConnectionPool.WorkerSupervisor, pid)
end)
:telemetry.execute(
[:pleroma, :connection_pool, :reclaim, :stop],
%{reclaimed_count: Enum.count(reclaimed)},
%{max_connections: max_connections}
)
{:stop, :normal, nil}
end