nixpkgs/nixos/tests/consul.nix
import ./make-test-python.nix (
  { pkgs, lib, ... }:
  let
    # Settings for both servers and agents
    webUi = true;
    retry_interval = "1s";
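    # A raft_multiplier of 1 keeps Consul's Raft timing at its minimum,
    # so leader failures are detected as quickly as possible in this test.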
    raft_multiplier = 1;

    defaultExtraConfig = {
      inherit retry_interval;
      performance = {
        inherit raft_multiplier;
      };
    };
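    # Static IPs of the Consul servers on the test network; each server binds
    # to its own entry and the full list is used as retry_join targets.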
    allConsensusServerHosts = [
      "192.168.1.1"
      "192.168.1.2"
      "192.168.1.3"
    ];

    allConsensusClientHosts = [
      "192.168.2.1"
      "192.168.2.2"
    ];

    firewallSettings = {
      # See https://www.consul.io/docs/install/ports.html
      allowedTCPPorts = [
        8301
        8302
        8600
        8500
        8300
      ];
      allowedUDPPorts = [
        8301
        8302
        8600
      ];
    };
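    # NixOS configuration for a Consul agent in client (non-server) mode,
    # using the index-th address from allConsensusClientHosts.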
    client =
      index:
      { pkgs, ... }:
      let
        ip = builtins.elemAt allConsensusClientHosts index;
      in
      {
        environment.systemPackages = [ pkgs.consul ];

        # Priority 0 overrides any other definition of this interface's
        # addresses, including the one the test driver assigns by default.
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };
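    # NixOS configuration for a Consul server; servers are identified by
    # their IP addresses in allConsensusServerHosts.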
    server =
      index:
      { pkgs, ... }:
      let
        numConsensusServers = builtins.length allConsensusServerHosts;
        thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
        ip = thisConsensusServerHost; # since we already use IPs to identify servers
      in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul =
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
              # Wait until this many servers have joined before bootstrapping
              # the cluster for the first time.
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # Ensures we don't permanently lose consensus after a temporary loss.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, the node must not try to join itself, and join only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
                if numConsensusServers == 1 then
                  allConsensusServerHosts
                else
                  builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
  in
  {
    name = "consul";

    nodes = {
      server1 = server 0;
      server2 = server 1;
      server3 = server 2;
      client1 = client 0;
      client2 = client 1;
    };
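    # The test driver exposes each node defined above as a Python object
    # (server1, ..., client2) inside the test script below.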
    testScript = ''
      servers = [server1, server2, server3]
      machines = [server1, server2, server3, client1, client2]

      for m in machines:
          m.wait_for_unit("consul.service")


      def wait_for_healthy_servers():
          # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
          # for why the `Voter` column of `list-peers` has that info.
          # TODO: The `grep true` relies on the fact that currently, in
          #       output like
          #           # consul operator raft list-peers
          #           Node     ID   Address           State     Voter  RaftProtocol
          #           server3  ...  192.168.1.3:8300  leader    true   3
          #           server2  ...  192.168.1.2:8300  follower  true   3
          #           server1  ...  192.168.1.1:8300  follower  false  3
          #       `Voter` is the only boolean column.
          #       Change this to the more reliable way defined by
          #       https://github.com/hashicorp/consul/issues/8118
          #       once that ticket is closed.
          for m in machines:
              m.wait_until_succeeds(
                  "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
              )


      def wait_for_all_machines_alive():
          """
          Note that Serf-"alive" does not mean "Raft"-healthy;
          see `wait_for_healthy_servers()` for that instead.
          """
          for m in machines:
              m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


      wait_for_healthy_servers()
      # Also wait for clients to be alive.
      wait_for_all_machines_alive()

      client1.succeed("consul kv put testkey 42")
      client2.succeed("[ $(consul kv get testkey) == 42 ]")

      def rolling_restart_test(proper_rolling_procedure=True):
          """
          Tests that the cluster can tolerate failures of any single server,
          following the recommended rolling upgrade procedure from
          https://www.consul.io/docs/upgrading#standard-upgrades.

          Optionally, `proper_rolling_procedure=False` can be given
          to wait only for each server to be back `Healthy`, not `Stable`
          in the Raft consensus; see Consul setting `ServerStabilizationTime` and
          https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
          """

          for server in servers:
              server.block()
              server.systemctl("stop consul")

              # Make sure the stopped peer is recognized as being down
              client1.wait_until_succeeds(
                  f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
              )

              # For each client, wait until they have connection again
              # using `kv get -recurse` before issuing commands.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions while one server is down.
              client1.succeed("consul kv put testkey 43")
              client2.succeed("[ $(consul kv get testkey) == 43 ]")
              client2.succeed("consul kv delete testkey")

              server.unblock()
              server.systemctl("start consul")

              if proper_rolling_procedure:
                  # Wait for recovery.
                  wait_for_healthy_servers()
              else:
                  # NOT proper rolling upgrade procedure, see above.
                  wait_for_all_machines_alive()

              # Wait for client connections.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions with server back up.
              client1.succeed("consul kv put testkey 44")
              client2.succeed("[ $(consul kv get testkey) == 44 ]")
              client2.succeed("consul kv delete testkey")

      def all_servers_crash_simultaneously_test():
          """
          Tests that the cluster will eventually come back after all
          servers crash simultaneously.
          """

          for server in servers:
              server.block()
              server.systemctl("stop --no-block consul")

          for server in servers:
              # --no-block is async, so ensure it has been stopped by now
              server.wait_until_fails("systemctl is-active --quiet consul")
              server.unblock()
              server.systemctl("start consul")

          # Wait for recovery.
          wait_for_healthy_servers()

          # Wait for client connections.
          client1.wait_until_succeeds("consul kv get -recurse")
          client2.wait_until_succeeds("consul kv get -recurse")

          # Do some consul actions with servers back up.
          client1.succeed("consul kv put testkey 44")
          client2.succeed("[ $(consul kv get testkey) == 44 ]")
          client2.succeed("consul kv delete testkey")

      # Run the tests.

      print("rolling_restart_test()")
      rolling_restart_test()

      print("all_servers_crash_simultaneously_test()")
      all_servers_crash_simultaneously_test()

      print("rolling_restart_test(proper_rolling_procedure=False)")
      rolling_restart_test(proper_rolling_procedure=False)
    '';
  }
)