This is the Nix module I wrote with Claude. I DO NOT RECOMMEND USING THIS. Once I’ve properly reviewed and tested it, I’ll update this page with a better description. But until then, DON’T USE IT.
If I can license this code, which I’m pretty sure I cannot, I release it under the MIT license.
{
config,
lib,
pkgs,
...
}:
let
cfg = config.services.wormholeSource.instances;
# ---------------------------------------------------------------------------
# IP helpers
# ---------------------------------------------------------------------------
# Detect whether an IP string (with or without a CIDR prefix) is IPv6.
# We use the presence of a colon as the distinguishing feature -- IPv4
# addresses never contain colons, IPv6 addresses always do. The CIDR
# prefix (e.g. "/64") is stripped first so it doesn't interfere.
isIPv6 = ip: builtins.match ".*:.*" (builtins.head (lib.splitString "/" ip)) != null;
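# Examples:
#   isIPv6 "203.0.113.1"  -> false
#   isIPv6 "fd00::1/64"   -> true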
# Remove the subnet prefix length from a CIDR string, e.g. "10.0.0.1/24"
# becomes "10.0.0.1". Used when constructing nftables DNAT targets, which
# take a bare address rather than a network prefix.
stripSubnet = ip: builtins.head (lib.splitString "/" ip);
# Return a list of the IP families present in a list of IP strings.
# Produces a subset of [ "ipv4" "ipv6" ] depending on what is found.
# Used by familiesOverlap to check that two sets of IPs can communicate.
ipFamilies =
ips:
let
has4 = builtins.any (ip: !isIPv6 ip) ips;
has6 = builtins.any isIPv6 ips;
in
lib.optional has4 "ipv4" ++ lib.optional has6 "ipv6";
# Return true if two lists of IP addresses share at least one address family.
# We use this to validate that traffic can actually flow between two sets of
# addresses -- an IPv4 inbound IP paired with an IPv6-only tunnel, for
# example, can never be forwarded regardless of the nftables configuration.
familiesOverlap =
a: b:
let
fa = ipFamilies a;
fb = ipFamilies b;
in
builtins.any (f: builtins.elem f fb) fa;
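# Examples:
#   familiesOverlap [ "10.0.0.1" ] [ "fd00::2" ]           -> false
#   familiesOverlap [ "10.0.0.1" "fd00::1" ] [ "fd00::2" ] -> true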
# ---------------------------------------------------------------------------
# Endpoint helpers
# ---------------------------------------------------------------------------
# Extract the host portion from a WireGuard endpoint string. WireGuard
# endpoints are always "host:port", but the host may itself contain colons
# if it is an IPv6 address, in which case it must be bracketed per RFC 3986.
# The three formats we handle:
# "1.2.3.4:51820" -> "1.2.3.4" (IPv4)
# "[::1]:51820" -> "::1" (IPv6, bracketed)
# "peer.example.com:51820" -> "peer.example.com" (hostname)
# Returns null for a null input or a string with no colon (malformed).
endpointHost =
endpoint:
if endpoint == null then
null
else if builtins.substring 0 1 endpoint == "[" then
# Bracketed IPv6: strip the leading "[", split on "]", take what's inside
builtins.head (lib.splitString "]" (builtins.substring 1 (-1) endpoint))
else
# IPv4 or hostname: the port is after the last colon, so we rejoin
# everything before it. For IPv4 there is only one colon, so this is
# just the address. For hostnames it preserves any dots.
let
parts = lib.splitString ":" endpoint;
in
if builtins.length parts < 2 then
null # no colon at all -> malformed, treat as unknown
else
lib.concatStringsSep ":" (lib.init parts);
# Determine the IP family of the WireGuard outer transport from the endpoint
# string alone. Returns "ipv4", "ipv6", or null.
#
# null means the family cannot be determined -- either the endpoint is a
# hostname (which could resolve to either family at runtime), null (the peer
# initiates, so we never dial out and don't know what address we receive on),
# or the string is malformed. In all of these cases the caller must decide on
# a fallback; see resolveEndpointFamily below.
#
# Note: we explicitly check that the host looks like a dotted-decimal IPv4
# address before returning "ipv4". Without this check, a hostname like
# "peer.example.com" would have no colons and be incorrectly classified as
# IPv4 -- but a hostname could resolve to an AAAA record, making the IPv4
# MSS value too large and causing packets to be silently dropped.
detectEndpointFamily =
endpoint:
let
host = endpointHost endpoint;
in
if host == null then
null
else if isIPv6 host then
"ipv6"
else if builtins.match "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" host != null then
"ipv4"
else
null; # hostname -- family is ambiguous, treat as unknown
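# Examples:
#   detectEndpointFamily "203.0.113.2:51820"       -> "ipv4"
#   detectEndpointFamily "[2001:db8::2]:51820"     -> "ipv6"
#   detectEndpointFamily "peer.example.com:51820"  -> null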
# Resolve the definitive endpoint family used for MSS clamping, combining
# automatic detection with the user's explicit endpointFamily override.
#
# Priority:
# 1. If the user has set endpointFamily, use it. This takes precedence
# over detection even when the endpoint is a literal IP address,
# because the user may intentionally want a more conservative value.
# The canonical case is a chained tunnel: if this source's sink is
# itself a wormhole source forwarding into another tunnel, the TCP
# MSS negotiation must account for all the headers in the chain, not
# just the immediate hop. Setting endpointFamily = "ipv6" forces the
# most conservative MSS regardless of the immediate peer's address.
# The one override that is always an error -- detected IPv6 with a
# user-specified "ipv4" -- is caught by an assertion before we get here.
# 2. If no override is set, use the detected family from the endpoint string.
# 3. If detection also returned null (hostname, null endpoint, or
# unparseable), fall back to "ipv6" -- the conservative choice, since
# the IPv6 MSS is smaller. Over-clamping wastes a few bytes per packet
# but is always safe; under-clamping can stall connections entirely
# when ICMP "fragmentation needed" messages are filtered, which is common.
resolveEndpointFamily =
endpoint: override:
let
detected = detectEndpointFamily endpoint;
in
if override != null then
override
else if detected != null then
detected
else
"ipv6";
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
# Validated CIDR address (IPv4 or IPv6 with mandatory prefix length).
# Used for local tunnelIPs, where the prefix is required so the kernel knows
# the size of the WireGuard tunnel network (e.g. "10.0.0.1/24").
# The pattern is intentionally permissive on the address octets/groups --
# catching gross format errors here is useful, but the kernel will reject
# anything genuinely malformed at interface bring-up time anyway.
cidrType = lib.types.strMatching "([0-9]{1,3}(\\.[0-9]{1,3}){3}/[0-9]{1,2}|[0-9a-fA-F:]+/[0-9]{1,3})";
# Bare IP address without a prefix, IPv4 or IPv6.
# Used for target tunnel IPs and match IPs, where there is no subnet to
# specify -- these are point addresses, not network prefixes.
ipType = lib.types.strMatching "([0-9]{1,3}(\\.[0-9]{1,3}){3}|[0-9a-fA-F:]+)";
# A single port forwarding mapping: { from, to } where `to` defaults to
# `from` so that simple same-port forwarding requires only one field.
# Using typed port values (validated integers 1-65535) rather than strings
# avoids the error-prone manual splitting and parsing of "src:dst" strings.
portMappingType = lib.types.submodule (
{ config, ... }:
{
options = {
from = lib.mkOption {
type = lib.types.port;
description = "Source port to match on inbound traffic.";
};
to = lib.mkOption {
type = lib.types.port;
description = "Destination port to forward to on the sink. Defaults to `from`.";
};
};
config.to = lib.mkDefault config.from;
}
);
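# For example, { from = 80; } forwards port 80 unchanged, while
# { from = 443; to = 8443; } rewrites the destination port to 8443.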
# Inbound traffic selector. Populate exactly one of `interfaces` or `ips`;
# the active mode is inferred at evaluation time from whichever is non-empty.
#
# interfaces: match all traffic arriving on these network interfaces.
# The same interfaces are used as the WAN side in nftables
# forward rules, so they serve double duty as both the inbound
# traffic selector and the routing boundary for the tunnel.
#
# ips: match traffic destined for these specific IP addresses,
# regardless of which interface it arrives on. Because we do
# not know the WAN interface in this mode, forward rules are
# written without an inbound interface constraint (iif), relying
# solely on the outbound interface (oif) toward the tunnel.
matchType = lib.types.submodule {
options = {
interfaces = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ ];
example = [
"eno1"
"eno2"
];
description = "WAN interfaces to match inbound traffic on. Mutually exclusive with `ips`.";
};
ips = lib.mkOption {
type = lib.types.listOf ipType;
default = [ ];
example = [
"203.0.113.1"
"2001:db8::1"
];
description = "Public destination IPs to match inbound traffic for. Mutually exclusive with `interfaces`.";
};
};
};
# Target (sink) submodule -- the remote machine that will receive forwarded
# traffic over the WireGuard tunnel.
targetType = lib.types.submodule {
options = {
tunnelIPs = lib.mkOption {
type = lib.types.nonEmptyListOf ipType;
example = [
"10.0.0.2"
"fd00::2"
];
description = "WireGuard tunnel IPs of the sink machine to DNAT traffic toward.";
};
endpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
example = "203.0.113.2:51820";
description = ''
WireGuard endpoint of the sink (host:port or [IPv6addr]:port).
If null, the sink must initiate the WireGuard connection to us rather
than the other way around. The IP family of this endpoint determines
which MSS clamp values are generated; see `endpointFamily` if the
family cannot be detected automatically (hostname or null endpoints).
'';
};
endpointFamily = lib.mkOption {
type = lib.types.nullOr (
lib.types.enum [
"ipv4"
"ipv6"
]
);
default = null;
example = "ipv6";
description = ''
Override the IP family used for MSS clamping on the WireGuard
transport. When set, this takes precedence over the family detected
from the endpoint address, so it is meaningful even for literal IP
endpoints.
The primary use case is chained tunnels: if the sink machine is
itself a wormhole source forwarding into a further tunnel, the TCP
MSS must account for all encapsulation headers in the chain, not
just the immediate hop. Set endpointFamily = "ipv6" to force the
most conservative (smallest) MSS regardless of the immediate peer
address family, ensuring packets are not too large for any link in
the chain.
If left null, the family is detected from the endpoint address, or
"ipv6" is assumed when it cannot be detected (hostname or null
endpoint). The one forbidden combination is endpointFamily = "ipv4"
with a literal IPv6 endpoint address -- the outer header is provably
40 bytes in that case, and understating it would allow oversized
packets that will be dropped inside the tunnel.
'';
};
pathMTU = lib.mkOption {
type = lib.types.ints.between 576 9000;
default = 1500;
example = 1492;
description = ''
The MTU of the path between this machine and the sink. Used as the
basis for MSS clamping instead of the standard Ethernet MTU of 1500.
You should lower this if the tunnel path has a smaller MTU than
standard Ethernet -- common cases are PPPoE uplinks (1492 bytes) or
paths that cross other tunnels with their own encapsulation overhead.
This matters here in a way it would not for a userspace proxy: because
this module forwards packets at layer 3 using NAT, the original TCP
sender never receives ICMP "fragmentation needed" messages generated
by links inside the tunnel path. Those messages are addressed to this
machine (the outer packet source) rather than to the client, and the
NAT makes it impossible to translate them back upstream. MSS clamping
is therefore the only mechanism available to account for reduced MTU
on the tunnel path, and it must be configured explicitly when the path
MTU is known to be smaller than 1500.
The valid range (576-9000) covers standard Ethernet, PPPoE, jumbo
frames, and the IPv4 minimum reassembly MTU of 576 bytes.
'';
};
publicKey = lib.mkOption {
type = lib.types.str;
description = "WireGuard public key of the sink machine.";
};
ports = lib.mkOption {
type = lib.types.nonEmptyListOf portMappingType;
example = [
{ from = 80; }
{
from = 443;
to = 8443;
}
];
description = "Port mappings to forward. `to` defaults to `from` when omitted.";
};
protocols = lib.mkOption {
type = lib.types.nonEmptyListOf (
lib.types.enum [
"tcp"
"udp"
]
);
default = [
"tcp"
"udp"
];
description = "Transport protocols to forward. Defaults to both TCP and UDP.";
};
};
};
# ---------------------------------------------------------------------------
# nftables rule generation
# ---------------------------------------------------------------------------
#
# nftables organises firewall logic into tables, chains, and rules:
#
# Table -- a named container for chains. We create one per wormhole
# instance ("wormhole-<n>") so instances are fully isolated
# from each other and from the host's own firewall rules.
# Tables are created and torn down with the WireGuard interface.
#
# Chain -- an ordered list of rules attached to a specific point in the
# kernel's packet processing pipeline via a "hook". The hook
# determines when in the packet's journey the rules are evaluated.
# Chains also have a "type" (nat or filter) and a "priority"
# that controls ordering relative to other chains on the same hook.
#
# Rule -- a match-action pair. Packets that satisfy the match conditions
# have the action applied (e.g. dnat, masquerade, accept).
#
# We create three chains per instance:
#
# prerouting (type nat, hook prerouting, priority dstnat)
# Evaluated before routing decisions are made. We use this for DNAT
# (destination NAT), which rewrites the packet's destination address and
# port so the kernel routes it into the WireGuard tunnel rather than
# delivering it locally.
#
# postrouting (type nat, hook postrouting, priority srcnat)
# Evaluated after routing, just before the packet leaves the machine.
# We use snat here, rewriting the source address to one of this machine's
# tunnel IPs. This is necessary because the sink knows only its tunnel
# peer (us), not the original client; without this rewrite the sink's
# reply packets would be addressed to the original client IP, which it
# has no route for. Using an explicit pool of tunnel IPs rather than
# plain masquerade allows multiple source IPs to be used, multiplying
# the number of concurrent connections beyond the single-IP port limit.
#
# forward (type filter, hook forward, priority filter)
# Evaluated for packets that are being routed through this machine rather
# than destined for it. The kernel's default is to drop forwarded packets,
# so we must explicitly accept traffic flowing between the WAN side and
# the tunnel interface in both directions. MSS clamping rules also live
# here, applied on the SYN packet that opens each TCP connection.
instanceRules =
name: icfg:
let
wif = "wh-${name}"; # WireGuard tunnel interface name
tbl = "wormhole-${name}"; # nftables table name
# Whether the sink's tunnel IPs include each address family. We need
# this to know which MSS clamping rules to generate -- there is no
# point emitting IPv6 MSS rules if the tunnel carries only IPv4 traffic.
targetHasIPv4 = builtins.any (ip: !isIPv6 ip) icfg.target.tunnelIPs;
targetHasIPv6 = builtins.any isIPv6 icfg.target.tunnelIPs;
# The IP family of the WireGuard outer transport (the "outer" packet that
# carries the encrypted WireGuard payload across the internet). This
# determines the outer IP header size used in the MSS calculation.
outerFamily = resolveEndpointFamily icfg.target.endpoint icfg.target.endpointFamily;
outerIsIPv6 = outerFamily == "ipv6";
# Calculate the correct TCP MSS (Maximum Segment Size) for a given
# combination of inner and outer IP families.
#
# MSS is the largest TCP payload a segment may carry. Hosts negotiate
# it during the TCP handshake. If MSS is too large, packets exceed the
# link MTU after encapsulation and either get fragmented (hurting
# performance) or dropped (breaking connections entirely if ICMP
# "fragmentation needed" messages are filtered, which is common).
#
# Standard Ethernet MTU is 1500 bytes. From that we subtract every
# header that wraps the TCP payload on the path through the tunnel:
#
# - Outer IP header: 20 bytes (IPv4) or 40 bytes (IPv6)
# The header of the packet that crosses the internet to the peer.
# - UDP header: 8 bytes (WireGuard always uses UDP)
# - WireGuard header: 32 bytes (fixed overhead for the WG framing)
# - Inner IP header: 20 bytes (IPv4) or 40 bytes (IPv6)
# The header of the original packet being forwarded inside the tunnel.
# - TCP header: 20 bytes (minimum, no options)
#
# What remains is the maximum safe TCP payload size. We use
# target.pathMTU as the base rather than a hardcoded 1500 so that users
# who know their tunnel path has a smaller MTU -- PPPoE uplinks at 1492,
# or paths that cross additional tunnels -- can express that here. This
# is especially important for this module because the NAT severs the ICMP
# feedback loop that would normally allow the sender to discover a reduced
# path MTU on their own; see the target.pathMTU option description.
mssFor =
innerIsIPv6: outerIsIPv6_:
let
outerIP = if outerIsIPv6_ then 40 else 20;
innerIP = if innerIsIPv6 then 40 else 20;
in
icfg.target.pathMTU - outerIP - 8 - 32 - innerIP - 20;
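# Worked examples, using the default pathMTU of 1500:
#   mssFor false false = 1500 - 20 - 8 - 32 - 20 - 20 = 1400 (IPv4 in IPv4)
#   mssFor true  true  = 1500 - 40 - 8 - 32 - 40 - 20 = 1360 (IPv6 in IPv6)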
# Build the inbound match fragments for the prerouting DNAT rules, paired
# with the IP family implied by each fragment (or null for interface mode
# where no specific family is implied).
#
# Each fragment is a complete nft expression that selects which packets
# should be redirected into the tunnel. The paired family is used in
# dnatRules to ensure we only DNAT to tunnel IPs of a compatible family --
# mixing families in a single nftables rule (e.g. "ip daddr" with
# "dnat ip6 to") is a hard error in nftables.
#
# In interface mode, "iif" (input interface) matches packets arriving on
# a named interface, regardless of their destination address. This is the
# simpler case -- all traffic on that interface is a forwarding candidate,
# and the tunnel IP family is not constrained by the inbound match.
#
# In IP mode, we match on the destination address ("ip daddr" for IPv4,
# "ip6 daddr" for IPv6). Each IP becomes its own fragment because nftables
# rules have a single match condition of this type; to match multiple
# addresses we emit multiple rules rather than trying to combine them.
# The family is known from the address, so we record it alongside the
# fragment and use it to filter compatible tunnel IPs in dnatRules.
usingInterfaces = icfg.match.interfaces != [ ];
inboundMatchFragments =
if usingInterfaces then
map (i: {
frag = "iif \"${i}\"";
family = null;
}) icfg.match.interfaces
else
map (
ip:
if isIPv6 ip then
{
frag = "ip6 daddr ${ip}";
family = "ipv6";
}
else
{
frag = "ip daddr ${ip}";
family = "ipv4";
}
) icfg.match.ips;
# Generate the DNAT prerouting rules. For each combination of inbound
# match, tunnel destination IP, port mapping, and protocol, we emit one
# rule of the form:
#
# nft add rule inet <tbl> prerouting <match> <proto> dport <from>
# dnat <ip|ip6> to <tunnelIP>:<to>
#
# "dnat to" rewrites the packet's destination address and port before the
# kernel makes its routing decision, so the packet is sent into the tunnel
# rather than delivered locally. The "ip" or "ip6" qualifier tells nftables
# which NAT helper to use and must match the address family of the tunnel IP.
#
# In IP mode we filter tunnel IPs to only those whose family matches the
# inbound match fragment's family. Pairing an "ip daddr" match with a
# "dnat ip6 to" action is a hard nftables error, and the familiesOverlap
# assertion only ensures some families are shared -- it does not prevent
# mixed lists where individual IPs might be mismatched. We silently skip
# incompatible pairings rather than erroring, since the compatible rules
# for the same IP are still emitted and will handle that traffic correctly.
dnatRules = lib.concatMap (
m:
let
compatibleTunnelIPs = lib.filter (
tunnelIP:
m.family == null # interface mode: no family constraint
|| (isIPv6 tunnelIP) == (m.family == "ipv6") # IP mode: families must match
) icfg.target.tunnelIPs;
in
lib.concatMap (
tunnelIP:
lib.concatMap (
pm:
let
fam = if isIPv6 tunnelIP then "ip6" else "ip";
# nftables requires an IPv6 address to be bracketed when a port
# follows ("dnat ip6 to [fd00::2]:443"); IPv4 addresses take the
# port directly.
addr = if isIPv6 tunnelIP then "[${stripSubnet tunnelIP}]" else stripSubnet tunnelIP;
in
map (
protocol:
"nft add rule inet ${tbl} prerouting ${m.frag} ${protocol} dport ${toString pm.from} dnat ${fam} to ${addr}:${toString pm.to}"
) icfg.target.protocols
) icfg.target.ports
) compatibleTunnelIPs
) inboundMatchFragments;
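# For illustration, an instance named "web" (hypothetical, as are the
# interface and addresses) matching on eno1, with target tunnel IP
# 10.0.0.2 and the mapping { from = 443; to = 8443; }, would get the
# TCP rule:
#   nft add rule inet wormhole-web prerouting iif "eno1" tcp dport 443 dnat ip to 10.0.0.2:8443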
# The outbound interface expression matching the WireGuard tunnel.
# "oif" (output interface) selects packets that are about to leave via
# this interface, i.e. traffic headed into the tunnel toward the sink.
oifWif = "oif \"${wif}\"";
# "iif" (input interface) selects packets arriving from the tunnel,
# i.e. reply traffic from the sink traveling back toward the client.
iifWif = "iif \"${wif}\"";
# Generate MSS clamping rules for one inner IP family (the family of
# the payload traffic being forwarded, not the WireGuard transport).
#
# MSS clamping works by intercepting the TCP SYN and SYN-ACK packets
# that negotiate the connection's MSS during the handshake, and
# rewriting the MSS option to a value that is safe for the tunnel.
# We target only SYN packets ("tcp flags syn") because that is where the
# MSS option appears -- clamping on other packets would be a no-op.
#
# The inner match ("ip protocol tcp" for IPv4, "ip6 nexthdr tcp" for
# IPv6) selects TCP traffic by inspecting the protocol field in the inner
# IP header -- the one inside the tunnel payload, not the outer WireGuard
# transport packet. We need separate rules for each inner family because
# the field names differ between IPv4 and IPv6.
#
# We emit rules for both directions (WAN->tunnel and tunnel->WAN) because
# MSS is negotiated in both the SYN (client->server) and SYN-ACK
# (server->client), and both may need clamping.
#
# In interface mode, we can constrain both iif and oif, producing tightly
# scoped rules. In IP mode we do not know the WAN interface, so we emit
# one rule matching on oif only (traffic entering the tunnel) and one
# matching on iif only (traffic leaving the tunnel). Together these cover
# both directions without needing to know the WAN interface name.
#
# IPv4-outer clamp rules are only emitted when the transport is known to
# be IPv4 (a smaller outer header means a larger safe MSS, so emitting
# IPv4 rules when the transport is actually IPv6 would allow packets that
# are too large). IPv6-outer rules are always emitted because they
# produce the more conservative (smaller) MSS value and are safe to apply
# even if the transport turns out to be IPv4.
mssClampsFor =
innerIsIPv6:
let
innerMatch = if innerIsIPv6 then "ip6 nexthdr tcp" else "ip protocol tcp";
mss4 = toString (mssFor innerIsIPv6 false);
mss6 = toString (mssFor innerIsIPv6 true);
clampRulesFor =
mss:
if usingInterfaces then
# Interface mode: emit a pair of rules per WAN interface,
# one for each direction through the tunnel.
lib.concatMap (i: [
"nft add rule inet ${tbl} forward iif \"${i}\" ${oifWif} ${innerMatch} tcp flags syn tcp option maxseg size set ${mss}"
"nft add rule inet ${tbl} forward ${iifWif} oif \"${i}\" ${innerMatch} tcp flags syn tcp option maxseg size set ${mss}"
]) icfg.match.interfaces
else
# IP mode: emit one rule per direction, keyed solely on the
# tunnel interface since the WAN interface is unknown.
[
"nft add rule inet ${tbl} forward ${oifWif} ${innerMatch} tcp flags syn tcp option maxseg size set ${mss}"
"nft add rule inet ${tbl} forward ${iifWif} ${innerMatch} tcp flags syn tcp option maxseg size set ${mss}"
];
in
lib.optionals (!outerIsIPv6) (clampRulesFor mss4) ++ clampRulesFor mss6;
mssClamping =
lib.optionals targetHasIPv4 (mssClampsFor false) ++ lib.optionals targetHasIPv6 (mssClampsFor true);
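# For the same hypothetical "web" instance (IPv4-only target tunnel IPs,
# IPv6 transport, pathMTU 1500), this emits one pair of rules:
#   nft add rule inet wormhole-web forward iif "eno1" oif "wh-web" ip protocol tcp tcp flags syn tcp option maxseg size set 1380
#   nft add rule inet wormhole-web forward iif "wh-web" oif "eno1" ip protocol tcp tcp flags syn tcp option maxseg size set 1380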
# Build the snat address pools from this machine's tunnel IPs, split by
# family. stripSubnet removes the CIDR prefix since nftables snat pools
# take bare addresses. The sink's WireGuard peer config uses the full
# CIDR tunnelIPs so it accepts the entire subnet as a valid source range,
# covering all IPs in the pool regardless of how many are provisioned.
snatPool4 = map stripSubnet (builtins.filter (ip: !isIPv6 ip) icfg.tunnelIPs);
snatPool6 = map stripSubnet (builtins.filter isIPv6 icfg.tunnelIPs);
# Accept rules for the forward chain. The kernel drops forwarded packets
# by default; these rules permit traffic flowing in both directions
# between the WAN side and the tunnel interface. MSS clamping rules
# appear before these so they are evaluated first -- once a packet is
# accepted, no further rules in the chain are checked.
#
# As with MSS clamping, in interface mode we scope rules to the named
# WAN interfaces; in IP mode we fall back to matching only on the tunnel
# interface, accepting all forwarded traffic entering or leaving it.
forwardAccept =
if usingInterfaces then
lib.concatMap (i: [
"nft add rule inet ${tbl} forward iif \"${i}\" ${oifWif} accept"
"nft add rule inet ${tbl} forward ${iifWif} oif \"${i}\" accept"
]) icfg.match.interfaces
else
[
"nft add rule inet ${tbl} forward ${oifWif} accept"
"nft add rule inet ${tbl} forward ${iifWif} accept"
];
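# For the same hypothetical "web" instance the accept rules are:
#   nft add rule inet wormhole-web forward iif "eno1" oif "wh-web" accept
#   nft add rule inet wormhole-web forward iif "wh-web" oif "eno1" accept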
in
lib.concatStringsSep "\n" (
[
# Create the per-instance table. Using a separate table per instance
# means teardown is a single "nft delete table" with no risk of
# accidentally removing rules that belong to another instance or to
# the host's own firewall.
"nft add table inet ${tbl}"
# Prerouting chain: DNAT inbound traffic to the sink's tunnel IP.
# Priority "dstnat" is the conventional priority for destination NAT,
# ensuring it runs before any filtering chains see the packet.
"nft add chain inet ${tbl} prerouting '{ type nat hook prerouting priority dstnat; }'"
]
++ dnatRules
++ [
# Postrouting chain: rewrite the source IP of forwarded packets to
# one of this machine's tunnel IPs so the sink can route replies back.
# Priority "srcnat" is the conventional priority for source NAT.
"nft add chain inet ${tbl} postrouting '{ type nat hook postrouting priority srcnat; }'"
# Source NAT (snat) rewrites the source address of each forwarded
# packet to one of this machine's tunnel IPs before it enters the
# WireGuard tunnel. The sink sees the packet as coming from a tunnel
# IP it knows about, so it can route replies back through the tunnel.
# Without this the sink would receive packets sourced from the original
# client's public IP, which it has no route for.
#
# We use an explicit address pool rather than masquerade so that
# multiple tunnel IPs can be used as source addresses. The kernel
# distributes connections across the pool via conntrack, effectively
# multiplying the per-IP connection limit by the number of tunnel IPs
# provisioned. Each tunnel IP contributes one full ephemeral port range
# worth of concurrent connections (28232 by default; see the ephemeral
# port range tuning notes in the kernel tuning section below for how to
# expand this). Users who need more concurrent connections should
# provision additional IPs within the tunnel subnet and add them to
# tunnelIPs.
#
# IPv4 and IPv6 pools are separate rules because a single packet is
# either IPv4 or IPv6 and nftables cannot mix families in one snat rule.
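# For the hypothetical "web" instance with local tunnelIPs
# [ "10.0.0.1/24" "10.0.0.3/24" ], the IPv4 rule reads:
#   nft add rule inet wormhole-web postrouting oif "wh-web" ip saddr != { 10.0.0.1, 10.0.0.3 } snat ip to { 10.0.0.1, 10.0.0.3 }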
]
++ lib.optionals (snatPool4 != [ ]) [
"nft add rule inet ${tbl} postrouting ${oifWif} ip saddr != { ${lib.concatStringsSep ", " snatPool4} } snat ip to { ${lib.concatStringsSep ", " snatPool4} }"
]
++ lib.optionals (snatPool6 != [ ]) [
"nft add rule inet ${tbl} postrouting ${oifWif} ip6 saddr != { ${lib.concatStringsSep ", " snatPool6} } snat ip6 to { ${lib.concatStringsSep ", " snatPool6} }"
]
++ [
# Forward chain: allow and shape traffic crossing the tunnel boundary.
# Priority "filter" places this at the standard filtering hook point.
"nft add chain inet ${tbl} forward '{ type filter hook forward priority filter; }'"
]
++ mssClamping # MSS clamp rules must precede the accept rules
++ forwardAccept
);
enabledInstances = lib.filterAttrs (_: icfg: icfg.enable) cfg;
in
{
options.services.wormholeSource = {
instances = lib.mkOption {
type = lib.types.attrsOf (
lib.types.submodule {
options = {
enable = lib.mkEnableOption "wormhole source service";
match = lib.mkOption {
type = matchType;
description = ''
How to select inbound traffic for forwarding.
Populate exactly one of `match.interfaces` or `match.ips`.
'';
example = {
interfaces = [ "eno1" ];
};
};
tunnelIPs = lib.mkOption {
type = lib.types.nonEmptyListOf cidrType;
example = [
"10.0.0.1/24"
"fd00::1/64"
];
description = "WireGuard tunnel IP addresses for this machine, in CIDR notation.";
};
target = lib.mkOption {
type = targetType;
description = "Configuration for the sink (destination) machine.";
};
publicKeyFile = lib.mkOption {
type = lib.types.nullOr lib.types.path;
default = null;
example = "/var/lib/wormhole/my-instance.pub";
description = ''
If set, the WireGuard public key for this instance will be
derived from the private key and written to this path after
the interface comes up. The file is written by root and made
world-readable (0444), so any user or service on this machine
can read it without needing access to the private key.
This is useful for exposing the public key to a separate service
that serves it over HTTP or another protocol, allowing sink
operators to fetch it without out-of-band coordination. Public
keys are safe to expose freely -- knowing a public key gives an
attacker no ability to decrypt traffic or impersonate this machine.
Leave null to skip key export.
'';
};
};
}
);
default = { };
description = "Wormhole source instances, keyed by name.";
};
};
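# A minimal illustrative configuration (all names and addresses are
# hypothetical):
#
#   services.wormholeSource.instances.web = {
#     enable = true;
#     match.interfaces = [ "eno1" ];
#     tunnelIPs = [ "10.0.0.1/24" ];
#     target = {
#       tunnelIPs = [ "10.0.0.2" ];
#       endpoint = "203.0.113.2:51820";
#       publicKey = "<sink's WireGuard public key>";
#       ports = [ { from = 80; } { from = 443; to = 8443; } ];
#     };
#   };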
config = lib.mkIf (enabledInstances != { }) {
assertions = lib.concatLists (
lib.mapAttrsToList (name: icfg: [
{
# Linux enforces a 15-character limit on network interface names.
# Our tunnel interface is named "wh-<n>", so the instance name
# may be at most 12 characters (15 - length("wh-") = 12).
assertion = builtins.stringLength name <= 12;
message =
"wormholeSource.${name}: instance name must be 12 characters or fewer "
+ "(\"wh-${name}\" would exceed the 15-character Linux interface name limit).";
}
{
# Exactly one of the two match modes must be active. Both empty and
# both populated are errors -- we cannot infer the user's intent.
assertion = (icfg.match.interfaces != [ ]) != (icfg.match.ips != [ ]);
message = "wormholeSource.${name}: exactly one of match.interfaces or match.ips must be set.";
}
{
# When matching by destination IP, the inbound IPs and the tunnel IPs
# must share at least one address family. If they do not -- for example
# if match.ips are all IPv4 but tunnelIPs are all IPv6 -- then matched
# packets cannot be encapsulated into the tunnel and will be dropped.
assertion = icfg.match.ips == [ ] || familiesOverlap icfg.match.ips icfg.tunnelIPs;
message = "wormholeSource.${name}: match.ips and tunnelIPs share no IP address family -- traffic cannot be routed.";
}
{
# The local and target tunnel IPs must share at least one family.
# A mismatch here means DNAT would rewrite the destination to an
# address family that the WireGuard tunnel is not carrying, making
# the rewritten packets undeliverable.
assertion = familiesOverlap icfg.tunnelIPs icfg.target.tunnelIPs;
message = "wormholeSource.${name}: tunnelIPs and target.tunnelIPs share no IP address family -- DNAT targets would be unreachable.";
}
{
# The only hard constraint on endpointFamily is that it must not be
# "ipv4" when the endpoint is a literal IPv6 address. An IPv6 endpoint
# means the outer transport header is provably 40 bytes, so claiming
# it is only 20 bytes (IPv4) would produce an MSS 20 bytes too large,
# allowing packets that will be silently dropped inside the tunnel.
#
# The inverse -- specifying "ipv6" when the endpoint is a literal IPv4
# address -- is explicitly permitted. This is the chained tunnel case:
# the user knows there is additional encapsulation beyond the immediate
# peer and is deliberately choosing the more conservative MSS.
assertion =
let
detected = detectEndpointFamily icfg.target.endpoint;
in
detected != "ipv6" || icfg.target.endpointFamily != "ipv4";
message =
"wormholeSource.${name}: target.endpointFamily is \"ipv4\" "
+ "but the endpoint \"${toString icfg.target.endpoint}\" is an IPv6 address. "
+ "The outer header is provably 40 bytes; specifying \"ipv4\" would produce "
+ "an MSS that is 20 bytes too large and packets would be dropped in the tunnel.";
}
]) enabledInstances
);
# These two sysctls are required for this module to function. Without them
# the kernel silently drops all forwarded packets regardless of nftables
# rules, because the default policy is to not route packets between
# interfaces.
boot.kernel.sysctl = {
"net.ipv4.ip_forward" = 1;
"net.ipv6.conf.all.forwarding" = 1;
};
# ---------------------------------------------------------------------------
# Kernel tuning -- not managed by this module
# ---------------------------------------------------------------------------
#
# The following sysctls are NOT set by this module because they are global
# settings that affect all NAT and connection tracking on the machine.
# Setting them from a module risks silently breaking unrelated services.
# They are documented here because they are important for correct operation
# under load and should be considered by operators deploying this module.
#
# All of these can be set in NixOS via boot.kernel.sysctl.
#
# Connection tracking table size
# --------------------------------
# net.netfilter.nf_conntrack_max (default: varies, typically 65536)
#
# The maximum number of connections the kernel will track simultaneously
# across all NAT. When the table is full, new connections are dropped
# entirely with no error visible to the client. Under high connection
# counts -- especially with the snat pool capacity discussion below in
# mind -- this limit can be reached before the port space is exhausted.
# Operators expecting many concurrent connections should raise this.
#
# TCP timeouts
# ------------
#
# TCP is a stateful protocol with a well-defined lifecycle. conntrack tracks
# each connection through that lifecycle and applies a different timeout at
# each stage. Understanding the stages helps operators choose appropriate
# values:
#
# Opening handshake (SYN -> SYN-ACK -> ACK):
# The client sends SYN. The server replies SYN-ACK. The client confirms
# with ACK. Only after all three does the connection enter ESTABLISHED.
#
# Data transfer:
# Both sides exchange data freely. The connection stays ESTABLISHED until
# one side begins closing.
#
# Closing handshake (FIN -> ACK, FIN -> ACK):
# Either side sends FIN to signal it has finished sending. The other side
# ACKs, then sends its own FIN, which is ACKed in turn. Both sides must
# complete this exchange for a clean close. After the final ACK the
# initiating side enters TIME_WAIT briefly before the entry is removed,
# to absorb any delayed packets still in flight.
#
# Abnormal close:
# Either side may send RST at any time to abort immediately, skipping
# the closing handshake entirely. conntrack removes the entry quickly.
# If neither FIN nor RST is sent -- the peer simply disappears -- the
# entry stays until the ESTABLISHED timeout expires.
#
# net.netfilter.nf_conntrack_tcp_timeout_syn_sent (default: 120)
#
# A SYN has been sent but no SYN-ACK has arrived yet. The server may be
# unreachable or slow to respond. 120 seconds is generous; this can be
# lowered to fail fast on unreachable destinations.
#
# net.netfilter.nf_conntrack_tcp_timeout_syn_recv (default: 60)
#
# A SYN-ACK has been sent but the final ACK completing the handshake has
# not arrived. This is typically a half-open connection from a client that
# disappeared mid-handshake or a SYN flood. 60 seconds is reasonable.
#
# net.netfilter.nf_conntrack_tcp_timeout_established (default: 432000 -- 5 days)
#
# The connection completed its handshake and is in the data transfer phase,
# but has gone silent. If a client disappears without sending a FIN or RST
# -- power failure, network outage, crashed process -- its entry sits in
# the table for 5 days, holding a slot in the snat pool and the conntrack
# table. For forwarding proxies serving many short-lived or unreliable
# clients, lowering this to 3600 (1 hour) or less is common practice.
#
# net.netfilter.nf_conntrack_tcp_timeout_fin_wait (default: 120)
#
# One side has sent FIN and is waiting for the other side's FIN in return.
# The connection is half-closed: the initiating side has finished sending
# but the other side may still be sending data. 120 seconds allows the
# peer time to finish its own transmissions and send its FIN.
#
# net.netfilter.nf_conntrack_tcp_timeout_close_wait (default: 60)
#
# A FIN has been received and ACKed, but this side has not yet sent its
# own FIN. The local application still has data to send or has not yet
# called close(). 60 seconds gives the application time to finish up.
#
# net.netfilter.nf_conntrack_tcp_timeout_time_wait (default: 120)
#
# Both FINs have been exchanged and ACKed. The connection is functionally
# closed but the entry is retained briefly to absorb any delayed packets
# that are still in flight and might otherwise be misassigned to a new
# connection reusing the same port. 120 seconds is twice the 60-second
# maximum segment lifetime most implementations assume (RFC 793 itself
# specifies a more conservative 2-minute MSL). Lowering this risks rare
# packet misdelivery on heavily reused ports.
#
# net.netfilter.nf_conntrack_tcp_timeout_close (default: 10)
#
# A RST has been seen and the connection is being torn down immediately.
# 10 seconds is enough to flush any remaining in-flight packets before
# the entry is removed. This rarely needs adjustment.
#
# UDP timeouts
# ------------
# net.netfilter.nf_conntrack_udp_timeout (default: 30)
#
# How long to retain a conntrack entry for a UDP flow where traffic has
# only been seen in one direction. For UDP hole punching, the client must
# send a new packet before this expires to keep the mapping alive. If the
# client's keepalive interval is longer than 30 seconds, raise this to
# match it.
#
# net.netfilter.nf_conntrack_udp_timeout_stream (default: 180)
#
# How long to retain a conntrack entry for a UDP flow where traffic has
# been seen in both directions (considered an established stream). Once
# a UDP hole punching exchange is bidirectional this timeout applies.
# Raise this if the application's keepalive interval exceeds 180 seconds.
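#
# As an illustration only (these are not defaults set by this module,
# and the right values depend on workload), an operator expecting heavy
# forwarding load might set:
#
#   boot.kernel.sysctl = {
#     "net.netfilter.nf_conntrack_max" = 1048576;
#     "net.netfilter.nf_conntrack_tcp_timeout_established" = 3600;
#     "net.netfilter.nf_conntrack_udp_timeout" = 60;
#   };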
#
# Capacity exhaustion and monitoring
# ------------------------------------
#
# When the snat port pool for a tunnel IP is exhausted, the kernel
# silently drops new connection attempts -- no RST, no ICMP unreachable,
# nothing. The external client sends a SYN and waits. No SYN-ACK ever
# arrives. After exhausting its retransmission attempts (typically 75
# seconds to 3 minutes depending on the OS), the client's connect() call
# finally returns an error. The experience is indistinguishable from the
# server being entirely unreachable. The same silent drop behavior occurs
# if nf_conntrack_max is reached.
#
# Because exhaustion is silent, proactive monitoring is essential.
# The system-wide fill level is visible at:
#
# /proc/sys/net/netfilter/nf_conntrack_count (current tracked connections)
# /proc/sys/net/netfilter/nf_conntrack_max (the limit)
#
# However these are global counters and do not distinguish between
# wormhole traffic and other NAT on the machine. To monitor exhaustion
# specifically for each tunnel IP in the snat pool, use the conntrack
# utility (conntrack-tools in nixpkgs) to query the conntrack table
# directly via netlink:
#
# conntrack -L -s <tunnel-ip> 2>/dev/null | wc -l
#
# This counts the number of active connections currently sourced from
# that tunnel IP. Compare it against the ephemeral port range to gauge
# how close to exhaustion each IP is. Run once per tunnel IP in the pool.
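#
# For example, to check every pool IP in one pass (hypothetical pool of
# 10.0.0.1 and 10.0.0.3):
#
#   for ip in 10.0.0.1 10.0.0.3; do
#     printf '%s: %s\n' "$ip" "$(conntrack -L -s "$ip" 2>/dev/null | wc -l)"
#   done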
#
# Note that the conntrack table is only populated once the module's
# nftables rules are active and traffic has flowed through them.
# The conntrack utility communicates with the kernel via netlink sockets
# rather than procfs, so it works regardless of whether
# /proc/net/nf_conntrack is present on the system.
#
# Ephemeral port range and capacity tuning
# -----------------------------------------
#
# The ephemeral port range governs which ports the kernel may allocate as
# source ports for outgoing connections, and therefore determines the
# maximum concurrent connections per tunnel IP. It applies to both IPv4
# and IPv6 -- there is no separate IPv6 setting; the ip_local_port_range
# sysctl controls both address families despite the "ipv4" in its path.
#
# The current range can be read from:
#
# /proc/sys/net/ipv4/ip_local_port_range
#
# The default is typically "32768 60999". Both bounds are inclusive, so
# the default gives 60999 - 32768 + 1 = 28232 ports per tunnel IP. The
# total concurrent connection capacity across the snat pool is:
#
# (upper - lower + 1) * number_of_tunnel_ips
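#
# For example, the default range with a pool of 4 tunnel IPs allows
# 28232 * 4 = 112928 concurrent connections.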
#
# The range can be expanded to increase capacity. In NixOS:
#
# boot.kernel.sysctl."net.ipv4.ip_local_port_range" = "1024 65535";
#
# Note that this option takes a space-separated string of both bounds
# rather than a single integer, unlike most other sysctls.
#
# The theoretical maximum is 1-65535, but in practice:
#
# - Ports below 1024 are privileged and conventionally reserved for
# well-known services. Going below 1024 is inadvisable.
#
# - Ports already actively bound by listening services on this machine
# are skipped by the kernel during ephemeral port allocation. However
# this check is not race-free under high load -- if the lower bound of
# the ephemeral range overlaps with ports that services listen on,
# there is a small risk that a reply to an outgoing connection arrives
# on the same port a service is listening on. conntrack takes priority
# over local delivery for established flows, so the reply will normally
# be routed correctly, but the overlap is a source of unnecessary
# ambiguity. Set the lower bound above the highest port any service on
# this machine listens on to avoid it entirely.
#
# A common production setting is "1024 65535", giving 65535 - 1024 + 1 =
# 64512 ports per tunnel IP. Combined with multiple tunnel IPs this
# allows for very large connection counts without requiring additional
# infrastructure.
networking.wireguard.interfaces = lib.mapAttrs' (
name: icfg:
lib.nameValuePair "wh-${name}" {
ips = icfg.tunnelIPs;
privateKeyFile = "/etc/wireguard/wormhole-${name}.key";
# allowedIPsAsRoutes installs kernel routes for the peers' allowedIPs so
# the kernel knows to send packets for those addresses into this
# WireGuard interface rather than looking for another route. It is an
# interface-level option (not a per-peer one), so it lives here rather
# than inside the peer definition below.
allowedIPsAsRoutes = true;
peers = [
{
publicKey = icfg.target.publicKey;
# allowedIPs tells WireGuard which destination addresses should be
# routed into this tunnel toward the sink. Only traffic destined for
# the sink's tunnel IPs is sent through the tunnel; everything else
# is unaffected.
#
# Note: the sink's WireGuard peer config must list this machine's
# entire tunnel subnet (e.g. "10.0.0.0/24") in its own allowedIPs,
# not just a single IP. This is because snat may source packets from
# any of the provisioned tunnel IPs, and WireGuard on the sink will
# silently drop packets whose source address is not covered by the
# peer's allowedIPs. This is configured on the sink side and is
# outside the scope of this module.
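# An illustrative sink-side peer entry (assuming this machine's tunnel
# subnet is 10.0.0.0/24):
#   { publicKey = "<this machine's public key>";
#     allowedIPs = [ "10.0.0.0/24" ]; }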
allowedIPs = icfg.target.tunnelIPs;
endpoint = icfg.target.endpoint;
# persistentKeepalive sends a keepalive packet every 25 seconds.
# This keeps the NAT mapping alive on any stateful firewall or NAT
# device between us and the peer, which is important when the sink
# is behind a NAT and needs to receive unsolicited packets from us.
persistentKeepalive = 25;
}
];
# nftables rules are created after the interface comes up (postSetup)
# and removed when it goes down (postShutdown). Deleting the entire
# table on shutdown is intentional -- it removes all rules and chains
# in one atomic operation, with no risk of leaving stale rules behind.
postSetup = instanceRules name icfg;
postShutdown = "nft delete table inet wormhole-${name}";
}
) enabledInstances;
# For each instance that has publicKeyFile set, create a oneshot systemd
# service that derives the public key from the private key and writes it
# to the configured path. The service runs after the WireGuard interface
# unit (which ensures the private key file exists and is readable by
# root); anything that wants to serve the key should order itself after
# this unit.
#
# We use a oneshot service rather than running this in postSetup because
# the public key is stable for the lifetime of the private key -- it does
# not need to be re-derived every time the tunnel restarts. The service
# will re-run on boot and whenever the WireGuard interface unit is restarted,
# which is sufficient to keep the file current if the private key is rotated.
systemd.services = lib.mapAttrs' (
name: icfg:
lib.nameValuePair "wormhole-pubkey-${name}" {
description = "Export WireGuard public key for wormhole instance ${name}";
# Run after the WireGuard interface unit so the private key file is
# guaranteed to exist, and wantedBy that same unit so the export runs
# whenever the interface is brought up, without needing to be
# explicitly activated.
after = [ "wireguard-wh-${name}.service" ];
wantedBy = [ "wireguard-wh-${name}.service" ];
serviceConfig = {
Type = "oneshot";
# Run as root so we can read the private key from /etc/wireguard
# and write to wherever the user has configured, creating any
# intermediate directories as needed.
User = "root";
RemainAfterExit = true;
};
script = ''
# Derive the public key from the private key. wg pubkey reads the
# private key from stdin and writes the public key to stdout.
# The public key is safe to expose -- it cannot be used to decrypt
# traffic or derive the private key.
pubkey=$(${pkgs.wireguard-tools}/bin/wg pubkey < /etc/wireguard/wormhole-${name}.key)
# Create the destination directory if it does not exist, preserving
# any existing permissions on parent directories.
install -d -m 0755 "$(dirname '${toString icfg.publicKeyFile}')"
# Write the public key and make it world-readable. We use install
# rather than a redirect so the permissions are set atomically with
# the file creation rather than in a separate chmod call.
echo "$pubkey" | install -m 0444 /dev/stdin '${toString icfg.publicKeyFile}'
'';
}
) (lib.filterAttrs (_: icfg: icfg.enable && icfg.publicKeyFile != null) cfg);
};
}