Edit

IABSD.fr/src/sys/net/if_pfsync.c

Branch :

  • Show log

    Commit

  • Author : dlg
    Date : 2024-12-18 02:25:30
    Hash : e541a7ae
    Message : go back to r1.326, before i fiddled with packet generation and bpf. i've had a couple of reports of redundant firewalls misbehaving since these changes, so until i can figure out what's wrong i'm backing them out. reported by hrvoje popovski and mark patruck

  • sys/net/if_pfsync.c
  • /*	$OpenBSD: if_pfsync.c,v 1.330 2024/12/18 02:25:30 dlg Exp $	*/
    
    /*
     * Copyright (c) 2002 Michael Shalayeff
     * All rights reserved.
     *
     * Redistribution and use in source and binary forms, with or without
     * modification, are permitted provided that the following conditions
     * are met:
     * 1. Redistributions of source code must retain the above copyright
     *    notice, this list of conditions and the following disclaimer.
     * 2. Redistributions in binary form must reproduce the above copyright
     *    notice, this list of conditions and the following disclaimer in the
     *    documentation and/or other materials provided with the distribution.
     *
     * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
     * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
     * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
     * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
     * THE POSSIBILITY OF SUCH DAMAGE.
     */
    
    /*
     * Copyright (c) 2009, 2022, 2023 David Gwynne <dlg@openbsd.org>
     *
     * Permission to use, copy, modify, and distribute this software for any
     * purpose with or without fee is hereby granted, provided that the above
     * copyright notice and this permission notice appear in all copies.
     *
     * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     */
    
    #include "bpfilter.h"
    #include "pfsync.h"
    #include "kstat.h"
    
    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/time.h>
    #include <sys/malloc.h>
    #include <sys/mbuf.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <sys/timeout.h>
    #include <sys/kernel.h>
    #include <sys/sysctl.h>
    #include <sys/pool.h>
    #include <sys/syslog.h>
    #include <sys/tree.h>
    #include <sys/smr.h>
    #include <sys/percpu.h>
    #include <sys/refcnt.h>
    #include <sys/kstat.h>
    #include <sys/stdarg.h>
    
    #include <net/if.h>
    #include <net/if_types.h>
    #include <net/bpf.h>
    #include <net/netisr.h>
    #include <net/route.h>
    
    #include <netinet/in.h>
    #include <netinet/if_ether.h>
    #include <netinet/ip.h>
    #include <netinet/in_var.h>
    #include <netinet/ip_var.h>
    #include <netinet/ip_ipsp.h>
    #include <netinet/ip_icmp.h>
    #include <netinet/icmp6.h>
    #include <netinet/tcp.h>
    #include <netinet/tcp_seq.h>
    #include <netinet/tcp_fsm.h>
    #include <netinet/udp.h>
    
    #ifdef INET6
    #include <netinet6/in6_var.h>
    #include <netinet/ip6.h>
    #include <netinet6/ip6_var.h>
    #include <netinet6/nd6.h>
    #endif /* INET6 */
    
    #include "carp.h"
    #if NCARP > 0
    #include <netinet/ip_carp.h>
    #endif
    
    #include <net/pfvar.h>
    #include <net/pfvar_priv.h>
    #include <net/if_pfsync.h>
    
    #define PFSYNC_MINPKT ( \
    	sizeof(struct ip) + \
    	sizeof(struct pfsync_header))
    
    struct pfsync_softc;
    
    struct pfsync_deferral {
    	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
    	struct pf_state				*pd_st;
    	struct mbuf				*pd_m;
    	uint64_t				 pd_deadline;
    };
    TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
    
    #define PFSYNC_DEFER_NSEC	20000000ULL
    #define PFSYNC_DEFER_LIMIT	128
    #define PFSYNC_BULK_SND_IVAL_MS	20
    
    static struct pool pfsync_deferrals_pool;
    
    enum pfsync_bulk_req_state {
    	PFSYNC_BREQ_S_NONE,
    	PFSYNC_BREQ_S_START,
    	PFSYNC_BREQ_S_SENT,
    	PFSYNC_BREQ_S_BULK,
    	PFSYNC_BREQ_S_DONE,
    };
    
    static const char *pfsync_bulk_req_state_names[] = {
    	[PFSYNC_BREQ_S_NONE]		= "none",
    	[PFSYNC_BREQ_S_START]		= "start",
    	[PFSYNC_BREQ_S_SENT]		= "sent",
    	[PFSYNC_BREQ_S_BULK]		= "bulk",
    	[PFSYNC_BREQ_S_DONE]		= "done",
    };
    
    enum pfsync_bulk_req_event {
    	PFSYNC_BREQ_EVT_UP,
    	PFSYNC_BREQ_EVT_DOWN,
    	PFSYNC_BREQ_EVT_TMO,
    	PFSYNC_BREQ_EVT_LINK,
    	PFSYNC_BREQ_EVT_BUS_START,
    	PFSYNC_BREQ_EVT_BUS_END,
    };
    
    static const char *pfsync_bulk_req_event_names[] = {
    	[PFSYNC_BREQ_EVT_UP]		= "up",
    	[PFSYNC_BREQ_EVT_DOWN]		= "down",
    	[PFSYNC_BREQ_EVT_TMO]		= "timeout",
    	[PFSYNC_BREQ_EVT_LINK]		= "link",
    	[PFSYNC_BREQ_EVT_BUS_START]	= "bus-start",
    	[PFSYNC_BREQ_EVT_BUS_END]	= "bus-end",
    };
    
    struct pfsync_slice {
    	struct pfsync_softc	*s_pfsync;
    	struct mutex		 s_mtx;
    
    	struct pf_state_queue	 s_qs[PFSYNC_S_COUNT];
    	TAILQ_HEAD(, tdb)	 s_tdb_q;
    	size_t			 s_len;
    	struct mbuf_list	 s_ml;
    
    	struct taskq		*s_softnet;
    	struct task		 s_task;
    	struct timeout		 s_tmo;
    
    	struct mbuf_queue	 s_sendq;
    	struct task		 s_send;
    
    	struct pfsync_deferrals	 s_deferrals;
    	unsigned int		 s_deferred;
    	struct task		 s_deferrals_task;
    	struct timeout		 s_deferrals_tmo;
    
    	uint64_t		 s_stat_locks;
    	uint64_t		 s_stat_contended;
    	uint64_t		 s_stat_write_nop;
    	uint64_t		 s_stat_task_add;
    	uint64_t		 s_stat_task_run;
    	uint64_t		 s_stat_enqueue;
    	uint64_t		 s_stat_dequeue;
    
    	uint64_t		 s_stat_defer_add;
    	uint64_t		 s_stat_defer_ack;
    	uint64_t		 s_stat_defer_run;
    	uint64_t		 s_stat_defer_overlimit;
    
    	struct kstat		*s_kstat;
    } __aligned(CACHELINESIZE);
    
    #define PFSYNC_SLICE_BITS	 1
    #define PFSYNC_NSLICES		 (1 << PFSYNC_SLICE_BITS)
    
    struct pfsync_softc {
    	struct ifnet		 sc_if;
    	unsigned int		 sc_dead;
    	unsigned int		 sc_up;
    	struct refcnt		 sc_refs;
    
    	/* config */
    	struct in_addr		 sc_syncpeer;
    	unsigned int		 sc_maxupdates;
    	unsigned int		 sc_defer;
    
    	/* operation */
    	unsigned int		 sc_sync_ifidx;
    	unsigned int		 sc_sync_if_down;
    	void			*sc_inm;
    	struct task		 sc_ltask;
    	struct task		 sc_dtask;
    	struct ip		 sc_template;
    
    	struct pfsync_slice	 sc_slices[PFSYNC_NSLICES];
    
    	struct {
    		struct rwlock			 req_lock;
    		struct timeout			 req_tmo;
    		enum pfsync_bulk_req_state	 req_state;
    		unsigned int			 req_tries;
    		unsigned int			 req_demoted;
    	}			 sc_bulk_req;
    
    	struct {
    		struct rwlock			 snd_lock;
    		struct timeout			 snd_tmo;
    		time_t				 snd_requested;
    
    		struct pf_state			*snd_next;
    		struct pf_state			*snd_tail;
    		unsigned int			 snd_again;
    	}			 sc_bulk_snd;
    };
    
    static struct pfsync_softc	*pfsyncif = NULL;
    static struct cpumem		*pfsynccounters;
    
    static inline void
    pfsyncstat_inc(enum pfsync_counters c)
    {
    	counters_inc(pfsynccounters, c);
    }
    
    static int	pfsync_clone_create(struct if_clone *, int);
    static int	pfsync_clone_destroy(struct ifnet *);
    
    static int	pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *,
    		    struct rtentry *);
    static void	pfsync_start(struct ifqueue *);
    
    static int	pfsync_ioctl(struct ifnet *, u_long, caddr_t);
    static int	pfsync_up(struct pfsync_softc *);
    static int	pfsync_down(struct pfsync_softc *);
    
    static int	pfsync_set_mtu(struct pfsync_softc *, unsigned int);
    static int	pfsync_set_parent(struct pfsync_softc *,
    		    const struct if_parent *);
    static int	pfsync_get_parent(struct pfsync_softc *, struct if_parent *);
    static int	pfsync_del_parent(struct pfsync_softc *);
    
    static int	pfsync_get_ioc(struct pfsync_softc *, struct ifreq *);
    static int	pfsync_set_ioc(struct pfsync_softc *, struct ifreq *);
    
    static void	pfsync_syncif_link(void *);
    static void	pfsync_syncif_detach(void *);
    
    static void	pfsync_sendout(struct pfsync_softc *, struct mbuf *);
    static void	pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *);
    
    static void	pfsync_slice_tmo(void *);
    static void	pfsync_slice_task(void *);
    static void	pfsync_slice_sendq(void *);
    
    static void	pfsync_deferrals_tmo(void *);
    static void	pfsync_deferrals_task(void *);
    static void	pfsync_defer_output(struct pfsync_deferral *);
    
    static void	pfsync_bulk_req_evt(struct pfsync_softc *,
    		    enum pfsync_bulk_req_event);
    static void	pfsync_bulk_req_tmo(void *);
    
    static void	pfsync_bulk_snd_tmo(void *);
    
    #if NKSTAT > 0
    struct pfsync_kstat_data {
    	struct kstat_kv pd_locks;
    	struct kstat_kv pd_contended;
    	struct kstat_kv pd_write_nop;
    	struct kstat_kv pd_task_add;
    	struct kstat_kv pd_task_run;
    	struct kstat_kv pd_enqueue;
    	struct kstat_kv pd_dequeue;
    	struct kstat_kv pd_qdrop;
    
    	struct kstat_kv pd_defer_len;
    	struct kstat_kv pd_defer_add;
    	struct kstat_kv pd_defer_ack;
    	struct kstat_kv pd_defer_run;
    	struct kstat_kv pd_defer_overlimit;
    };
    
    static const struct pfsync_kstat_data pfsync_kstat_tpl = {
    	KSTAT_KV_INITIALIZER("locks",		KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("contended",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("write-nops",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("send-sched",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("send-run",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("enqueues",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("dequeues",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_UNIT_INITIALIZER("qdrops",
    	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
    
    	KSTAT_KV_UNIT_INITIALIZER("defer-len",
    	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
    	KSTAT_KV_INITIALIZER("defer-add",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("defer-ack",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("defer-run",	KSTAT_KV_T_COUNTER64),
    	KSTAT_KV_INITIALIZER("defer-over",	KSTAT_KV_T_COUNTER64),
    };
    
    static int
    pfsync_kstat_copy(struct kstat *ks, void *dst)
    {
    	struct pfsync_slice *s = ks->ks_softc;
    	struct pfsync_kstat_data *pd = dst;
    
    	*pd = pfsync_kstat_tpl;
    	kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks;
    	kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended;
    	kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop;
    	kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add;
    	kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run;
    	kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue;
    	kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue;
    	kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq);
    
    	kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred;
    	kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add;
    	kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack;
    	kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run;
    	kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit;
    
    	return (0);
    }
    #endif /* NKSTAT > 0 */
    
    #define PFSYNC_MAX_BULKTRIES	12
    
    struct if_clone	pfsync_cloner =
        IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
    
    void
    pfsyncattach(int npfsync)
    {
    	pfsynccounters = counters_alloc(pfsyncs_ncounters);
    	if_clone_attach(&pfsync_cloner);
    }
    
    static int
    pfsync_clone_create(struct if_clone *ifc, int unit)
    {
    	struct pfsync_softc *sc;
    	struct ifnet *ifp;
    	size_t i, q;
    
    	if (unit != 0)
    		return (ENXIO);
    
    	if (pfsync_deferrals_pool.pr_size == 0) {
    		pool_init(&pfsync_deferrals_pool,
    		    sizeof(struct pfsync_deferral), 0,
    		    IPL_MPFLOOR, 0, "pfdefer", NULL);
    		/* pool_cache_init(&pfsync_deferrals_pool); */
    	}
    
    	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
    	if (sc == NULL)
    		return (ENOMEM);
    
    	/* sc_refs is "owned" by IFF_RUNNING */
    
    	sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
    	sc->sc_maxupdates = 128;
    	sc->sc_defer = 0;
    
    	task_set(&sc->sc_ltask, pfsync_syncif_link, sc);
    	task_set(&sc->sc_dtask, pfsync_syncif_detach, sc);
    
    	rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq");
    	/* need process context to take net lock to call ip_output */
    	timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc);
    
    	rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd");
    	/* need process context to take net lock to call ip_output */
    	timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc);
    
    	ifp = &sc->sc_if;
    	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d",
    	    ifc->ifc_name, unit);
    	ifp->if_softc = sc;
    	ifp->if_ioctl = pfsync_ioctl;
    	ifp->if_output = pfsync_output;
    	ifp->if_qstart = pfsync_start;
    	ifp->if_type = IFT_PFSYNC;
    	ifp->if_hdrlen = sizeof(struct pfsync_header);
    	ifp->if_mtu = ETHERMTU;
    	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
    
    	for (i = 0; i < nitems(sc->sc_slices); i++) {
    		struct pfsync_slice *s = &sc->sc_slices[i];
    
    		s->s_pfsync = sc;
    
    		mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0);
    		s->s_softnet = net_tq(i);
    		timeout_set(&s->s_tmo, pfsync_slice_tmo, s);
    		task_set(&s->s_task, pfsync_slice_task, s);
    
    		mq_init(&s->s_sendq, 16, IPL_SOFTNET);
    		task_set(&s->s_send, pfsync_slice_sendq, s);
    
    		s->s_len = PFSYNC_MINPKT;
    		ml_init(&s->s_ml);
    
    		for (q = 0; q < nitems(s->s_qs); q++)
    			TAILQ_INIT(&s->s_qs[q]);
    		TAILQ_INIT(&s->s_tdb_q);
    
    		/* stupid NET_LOCK */
    		timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
    		task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
    		TAILQ_INIT(&s->s_deferrals);
    
    #if NKSTAT > 0
    		s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i,
    		    KSTAT_T_KV, 0);
    
    		kstat_set_mutex(s->s_kstat, &s->s_mtx);
    		s->s_kstat->ks_softc = s;
    		s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl);
    		s->s_kstat->ks_copy = pfsync_kstat_copy;
    		kstat_install(s->s_kstat);
    #endif
    	}
    
    	if_counters_alloc(ifp);
    	if_attach(ifp);
    	if_alloc_sadl(ifp);
    
    #if NCARP > 0
    	if_addgroup(ifp, "carp");
    #endif
    
    #if NBPFILTER > 0
    	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
    #endif
    
    	return (0);
    }
    
    static int
    pfsync_clone_destroy(struct ifnet *ifp)
    {
    	struct pfsync_softc *sc = ifp->if_softc;
    #if NKSTAT > 0
    	size_t i;
    #endif
    
    	NET_LOCK();
    	sc->sc_dead = 1;
    
    	if (ISSET(ifp->if_flags, IFF_RUNNING))
    		pfsync_down(sc);
    	NET_UNLOCK();
    
    	if_detach(ifp);
    
    #if NKSTAT > 0
    	for (i = 0; i < nitems(sc->sc_slices); i++) {
    		struct pfsync_slice *s = &sc->sc_slices[i];
    
    		kstat_destroy(s->s_kstat);
    	}
    #endif
    
    	free(sc, M_DEVBUF, sizeof(*sc));
    
    	return (0);
    }
    
    static void
    pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	va_list ap;
    
    	if (!ISSET(ifp->if_flags, IFF_DEBUG))
    		return;
    
    	printf("%s: ", ifp->if_xname);
    	va_start(ap, fmt);
    	vprintf(fmt, ap);
    	va_end(ap);
    	printf("\n");
    }
    
    static void
    pfsync_syncif_link(void *arg)
    {
    	struct pfsync_softc *sc = arg;
    	struct ifnet *ifp0;
    	unsigned int sync_if_down = 1;
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) {
    		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK);
    		sync_if_down = 0;
    	}
    	if_put(ifp0);
    
    #if NCARP > 0
    	if (sc->sc_sync_if_down != sync_if_down) {
    		carp_group_demote_adj(&sc->sc_if,
    		    sync_if_down ? 1 : -1, "pfsync link");
    	}
    #endif
    
    	sc->sc_sync_if_down = sync_if_down;
    }
    
    static void
    pfsync_syncif_detach(void *arg)
    {
    	struct pfsync_softc *sc = arg;
    	struct ifnet *ifp = &sc->sc_if;
    
    	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
    		pfsync_down(sc);
    		if_down(ifp);
    	}
    
    	sc->sc_sync_ifidx = 0;
    }
    
    static int
    pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
        struct rtentry *rt)
    {
    	m_freem(m);	/* drop packet */
    	return (EAFNOSUPPORT);
    }
    
    static int
    pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
    {
    	struct pfsync_softc *sc = ifp->if_softc;
    	struct ifreq *ifr = (struct ifreq *)data;
    	int error = ENOTTY;
    
    	switch (cmd) {
    	case SIOCSIFADDR:
    		error = EOPNOTSUPP;
    		break;
    
    	case SIOCSIFFLAGS:
    		if (ISSET(ifp->if_flags, IFF_UP)) {
    			if (!ISSET(ifp->if_flags, IFF_RUNNING))
    				error = pfsync_up(sc);
    			else
    				error = ENETRESET;
    		} else {
    			if (ISSET(ifp->if_flags, IFF_RUNNING))
    				error = pfsync_down(sc);
    		}
    		break;
    
    	case SIOCSIFMTU:
    		error = pfsync_set_mtu(sc, ifr->ifr_mtu);
    		break;
    
    	case SIOCSIFPARENT:
    		error = pfsync_set_parent(sc, (struct if_parent *)data);
    		break;
    	case SIOCGIFPARENT:
    		error = pfsync_get_parent(sc, (struct if_parent *)data);
    		break;
    	case SIOCDIFPARENT:
    		error = pfsync_del_parent(sc);
    		break;
    
    	case SIOCSETPFSYNC:
    		error = pfsync_set_ioc(sc, ifr);
    		break;
    	case SIOCGETPFSYNC:
    		error = pfsync_get_ioc(sc, ifr);
    		break;
    
    	default:
    		break;
    	}
    
    	if (error == ENETRESET)
    		error = 0;
    
    	return (error);
    }
    
    static int
    pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	struct ifnet *ifp0;
    	int error = 0;
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 == NULL)
    		return (EINVAL);
    
    	if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) {
    		error = EINVAL;
    		goto put;
    	}
    
    	/* commit */
    	ifp->if_mtu = mtu;
    
    put:
    	if_put(ifp0);
    	return (error);
    }
    
    static int
    pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	struct ifnet *ifp0;
    	int error = 0;
    
    	ifp0 = if_unit(p->ifp_parent);
    	if (ifp0 == NULL)
    		return (ENXIO);
    
    	if (ifp0->if_index == sc->sc_sync_ifidx)
    		goto put;
    
    	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
    		error = EBUSY;
    		goto put;
    	}
    
    	/* commit */
    	sc->sc_sync_ifidx = ifp0->if_index;
    
    put:
    	if_put(ifp0);
    	return (error);
    }
    
    static int
    pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p)
    {
    	struct ifnet *ifp0;
    	int error = 0;
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 == NULL)
    		error = EADDRNOTAVAIL;
    	else
    		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
    	if_put(ifp0);
    
    	return (error);
    }
    
    static int
    pfsync_del_parent(struct pfsync_softc *sc)
    {
    	struct ifnet *ifp = &sc->sc_if;
    
    	if (ISSET(ifp->if_flags, IFF_RUNNING))
    		return (EBUSY);
    
    	/* commit */
    	sc->sc_sync_ifidx = 0;
    
    	return (0);
    }
    
    static int
    pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
    {
    	struct pfsyncreq pfsyncr;
    	struct ifnet *ifp0;
    
    	memset(&pfsyncr, 0, sizeof(pfsyncr));
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 != NULL) {
    		strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname,
    		    sizeof(pfsyncr.pfsyncr_syncdev));
    	}
    	if_put(ifp0);
    
    	pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer;
    	pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
    	pfsyncr.pfsyncr_defer = sc->sc_defer;
    
    	return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
    }
    
    static int
    pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	struct pfsyncreq pfsyncr;
    	unsigned int sync_ifidx = sc->sc_sync_ifidx;
    	int wantdown = 0;
    	int error;
    
    	error = suser(curproc);
    	if (error != 0)
    		return (error);
    
    	error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr));
    	if (error != 0)
    		return (error);
    
    	if (pfsyncr.pfsyncr_maxupdates > 255)
    		return (EINVAL);
    
    	if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */
    		struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev);
    		if (ifp0 == NULL)
    			return (ENXIO);
    
    		if (ifp0->if_index != sync_ifidx)
    			wantdown = 1;
    
    		sync_ifidx = ifp0->if_index;
    		if_put(ifp0);
    	} else { /* del */
    		wantdown = 1;
    		sync_ifidx = 0;
    	}
    
    	if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY)
    		pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
    	if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr)
    		wantdown = 1;
    
    	if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING))
    		return (EBUSY);
    
    	/* commit */
    	sc->sc_sync_ifidx = sync_ifidx;
    	sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer;
    	sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
    	sc->sc_defer = pfsyncr.pfsyncr_defer;
    
    	return (0);
    }
    
    static int
    pfsync_up(struct pfsync_softc *sc)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	struct ifnet *ifp0;
    	void *inm = NULL;
    	int error = 0;
    	struct ip *ip;
    
    	NET_ASSERT_LOCKED();
    	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
    
    	if (sc->sc_dead)
    		return (ENXIO);
    
    	/*
    	 * coordinate with pfsync_down(). if sc_up is still up and
    	 * we're here then something else is tearing pfsync down.
    	 */
    	if (sc->sc_up)
    		return (EBUSY);
    
    	if (sc->sc_syncpeer.s_addr == INADDR_ANY ||
    	    sc->sc_syncpeer.s_addr == INADDR_BROADCAST)
    		return (EDESTADDRREQ);
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 == NULL)
    		return (ENXIO);
    
    	if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) {
    		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
    			error = ENODEV;
    			goto put;
    		}
    		inm = in_addmulti(&sc->sc_syncpeer, ifp0);
    		if (inm == NULL) {
    			error = ECONNABORTED;
    			goto put;
    		}
    	}
    
    	sc->sc_up = 1;
    
    	ip = &sc->sc_template;
    	memset(ip, 0, sizeof(*ip));
    	ip->ip_v = IPVERSION;
    	ip->ip_hl = sizeof(*ip) >> 2;
    	ip->ip_tos = IPTOS_LOWDELAY;
    	/* len and id are set later */
    	ip->ip_off = htons(IP_DF);
    	ip->ip_ttl = PFSYNC_DFLTTL;
    	ip->ip_p = IPPROTO_PFSYNC;
    	ip->ip_src.s_addr = INADDR_ANY;
    	ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr;
    
    	/* commit */
    	refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */
    
    #if NCARP > 0
    	sc->sc_sync_if_down = 1;
    	carp_group_demote_adj(&sc->sc_if, 1, "pfsync up");
    #endif
    
    	if_linkstatehook_add(ifp0, &sc->sc_ltask);
    	if_detachhook_add(ifp0, &sc->sc_dtask);
    
    	sc->sc_inm = inm;
    	SET(ifp->if_flags, IFF_RUNNING);
    
    	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP);
    
    	refcnt_take(&sc->sc_refs); /* give one to SMR */
    	SMR_PTR_SET_LOCKED(&pfsyncif, sc);
    
    	pfsync_syncif_link(sc); /* try and push the bulk req state forward */
    
    put:
    	if_put(ifp0);
    	return (error);
    }
    
    static struct mbuf *
    pfsync_encap(struct pfsync_softc *sc, struct mbuf *m)
    {
    	struct {
    		struct ip		ip;
    		struct pfsync_header	ph;
    	} __packed __aligned(4) *h;
    	unsigned int mlen = m->m_pkthdr.len;
    
    	m = m_prepend(m, sizeof(*h), M_DONTWAIT);
    	if (m == NULL)
    		return (NULL);
    
    	h = mtod(m, void *);
    	memset(h, 0, sizeof(*h));
    
    	mlen += sizeof(h->ph);
    	h->ph.version = PFSYNC_VERSION;
    	h->ph.len = htons(mlen);
    	/* h->ph.pfcksum */
    
    	mlen += sizeof(h->ip);
    	h->ip = sc->sc_template;
    	h->ip.ip_len = htons(mlen);
    	h->ip.ip_id = htons(ip_randomid());
    
    	return (m);
    }
    
    static void
    pfsync_bulk_req_send(struct pfsync_softc *sc)
    {
    	struct {
    		struct pfsync_subheader	subh;
    		struct pfsync_upd_req	ur;
    	} __packed __aligned(4) *h;
    	unsigned mlen = max_linkhdr +
    	    sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h);
    	struct mbuf *m;
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL)
    		goto fail;
    
    	if (mlen > MHLEN) {
    		MCLGETL(m, M_DONTWAIT, mlen);
    		if (!ISSET(m->m_flags, M_EXT))
    			goto drop;
    	}
    
    	m_align(m, sizeof(*h));
    	m->m_len = m->m_pkthdr.len = sizeof(*h);
    
    	h = mtod(m, void *);
    	memset(h, 0, sizeof(*h));
    
    	h->subh.action = PFSYNC_ACT_UPD_REQ;
    	h->subh.len = sizeof(h->ur) >> 2;
    	h->subh.count = htons(1);
    
    	h->ur.id = htobe64(0);
    	h->ur.creatorid = htobe32(0);
    
    	m = pfsync_encap(sc, m);
    	if (m == NULL)
    		goto fail;
    
    	pfsync_sendout(sc, m);
    	return;
    
    drop:
    	m_freem(m);
    fail:
    	printf("%s: unable to request bulk update\n", sc->sc_if.if_xname);
    }
    
    static void
    pfsync_bulk_req_nstate(struct pfsync_softc *sc,
        enum pfsync_bulk_req_state nstate, int seconds)
    {
    	sc->sc_bulk_req.req_state = nstate;
    	if (seconds > 0)
    		timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds);
    	else
    		timeout_del(&sc->sc_bulk_req.req_tmo);
    }
    
    static void
    pfsync_bulk_req_invstate(struct pfsync_softc *sc,
        enum pfsync_bulk_req_event evt)
    {
    	panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname,
    	    pfsync_bulk_req_event_names[evt],
    	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]);
    }
    
    static void
    pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc)
    {
    	/* calculate the number of packets we expect */
    	int t = pf_pool_limits[PF_LIMIT_STATES].limit /
    	    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
    	     sizeof(struct pfsync_state));
    
    	/* turn it into seconds */
    	t /= 1000 / PFSYNC_BULK_SND_IVAL_MS;
    
    	if (t == 0)
    		t = 1;
    
    	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4);
    }
    
    static inline void
    pfsync_bulk_req_nstate_done(struct pfsync_softc *sc)
    {
    	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
    
    	KASSERT(sc->sc_bulk_req.req_demoted == 1);
    	sc->sc_bulk_req.req_demoted = 0;
    
    #if NCARP > 0
    	carp_group_demote_adj(&sc->sc_if, -32, "pfsync done");
    #endif
    }
    
    static void
    pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt)
    {
    	struct ifnet *ifp = &sc->sc_if;
    
    	rw_enter_write(&sc->sc_bulk_req.req_lock);
    	pfsync_dprintf(sc, "%s state %s evt %s", __func__,
    	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state],
    	    pfsync_bulk_req_event_names[evt]);
    
    	if (evt == PFSYNC_BREQ_EVT_DOWN) {
    		/* unconditionally move down */
    		sc->sc_bulk_req.req_tries = 0;
    		pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0);
    
    		if (sc->sc_bulk_req.req_demoted) {
    			sc->sc_bulk_req.req_demoted = 0;
    #if NCARP > 0
    			carp_group_demote_adj(&sc->sc_if, -32,
    			    "pfsync down");
    #endif
    		}
    	} else switch (sc->sc_bulk_req.req_state) {
    	case PFSYNC_BREQ_S_NONE:
    		switch (evt) {
    		case PFSYNC_BREQ_EVT_UP:
    			KASSERT(sc->sc_bulk_req.req_demoted == 0);
    			sc->sc_bulk_req.req_demoted = 1;
    #if NCARP > 0
    			carp_group_demote_adj(&sc->sc_if, 32,
    			    "pfsync start");
    #endif
    			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30);
    			break;
    		default:
    			pfsync_bulk_req_invstate(sc, evt);
    		}
    
    		break;
    
    	case PFSYNC_BREQ_S_START:
    		switch (evt) {
    		case PFSYNC_BREQ_EVT_LINK:
    			pfsync_bulk_req_send(sc);
    			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2);
    			break;
    		case PFSYNC_BREQ_EVT_TMO:
    			pfsync_dprintf(sc, "timeout waiting for link");
    			pfsync_bulk_req_nstate_done(sc);
    			break;
    		case PFSYNC_BREQ_EVT_BUS_START:
    			pfsync_bulk_req_nstate_bulk(sc);
    			break;
    		case PFSYNC_BREQ_EVT_BUS_END:
    			/* ignore this */
    			break;
    		default:
    			pfsync_bulk_req_invstate(sc, evt);
    		}
    		break;
    
    	case PFSYNC_BREQ_S_SENT:
    		switch (evt) {
    		case PFSYNC_BREQ_EVT_BUS_START:
    			pfsync_bulk_req_nstate_bulk(sc);
    			break;
    		case PFSYNC_BREQ_EVT_BUS_END:
    		case PFSYNC_BREQ_EVT_LINK:
    			/* ignore this */
    			break;
    		case PFSYNC_BREQ_EVT_TMO:
    			if (++sc->sc_bulk_req.req_tries <
    			    PFSYNC_MAX_BULKTRIES) {
    				pfsync_bulk_req_send(sc);
    				pfsync_bulk_req_nstate(sc,
    				    PFSYNC_BREQ_S_SENT, 2);
    				break;
    			}
    
    			pfsync_dprintf(sc,
    			    "timeout waiting for bulk transfer start");
    			pfsync_bulk_req_nstate_done(sc);
    			break;
    		default:
    			pfsync_bulk_req_invstate(sc, evt);
    		}
    		break;
    
    	case PFSYNC_BREQ_S_BULK:
    		switch (evt) {
    		case PFSYNC_BREQ_EVT_BUS_START:
    		case PFSYNC_BREQ_EVT_LINK:
    			/* ignore this */
    			break;
    		case PFSYNC_BREQ_EVT_BUS_END:
    			pfsync_bulk_req_nstate_done(sc);
    			break;
    		case PFSYNC_BREQ_EVT_TMO:
    			if (++sc->sc_bulk_req.req_tries <
    			    PFSYNC_MAX_BULKTRIES) {
    				pfsync_bulk_req_send(sc);
    				pfsync_bulk_req_nstate(sc,
    				    PFSYNC_BREQ_S_SENT, 2);
    			}
    
    			pfsync_dprintf(sc,
    			    "timeout waiting for bulk transfer end");
    			pfsync_bulk_req_nstate_done(sc);
    			break;
    		default:
    			pfsync_bulk_req_invstate(sc, evt);
    		}
    		break;
    
    	case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */
    		switch (evt) {
    		case PFSYNC_BREQ_EVT_BUS_START:
    		case PFSYNC_BREQ_EVT_BUS_END:
    		case PFSYNC_BREQ_EVT_LINK:
    			/* nops */
    			break;
    		default:
    			pfsync_bulk_req_invstate(sc, evt);
    		}
    		break;
    
    	default:
    		panic("%s: unknown event %d", ifp->if_xname, evt);
    		/* NOTREACHED */
    	}
    	rw_exit_write(&sc->sc_bulk_req.req_lock);
    }
    
    static void
    pfsync_bulk_req_tmo(void *arg)
    {
    	struct pfsync_softc *sc = arg;
    
    	NET_LOCK();
    	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO);
    	NET_UNLOCK();
    }
    
    static int
    pfsync_down(struct pfsync_softc *sc)
    {
    	struct ifnet *ifp = &sc->sc_if;
    	struct ifnet *ifp0;
    	struct smr_entry smr;
    	size_t i;
    	void *inm = NULL;
    	unsigned int sndbar = 0;
    	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
    	struct pfsync_deferral *pd;
    
    	NET_ASSERT_LOCKED();
    	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
    
    	/*
    	 * tearing down pfsync involves waiting for pfsync to stop
    	 * running in various contexts including softnet taskqs.
    	 * this thread cannot hold netlock while waiting for a
    	 * barrier in softnet because softnet might be waiting for
    	 * the netlock. sc->sc_up is used to coordinate with
    	 * pfsync_up.
    	 */
    
    	CLR(ifp->if_flags, IFF_RUNNING);
    
    	ifp0 = if_get(sc->sc_sync_ifidx);
    	if (ifp0 != NULL) {
    		if_linkstatehook_del(ifp0, &sc->sc_ltask);
    		if_detachhook_del(ifp0, &sc->sc_dtask);
    	}
    	if_put(ifp0);
    
    #if NCARP > 0
    	if (sc->sc_sync_if_down)
    		carp_group_demote_adj(&sc->sc_if, -1, "pfsync down");
    #endif
    
    	NET_UNLOCK();
    
    	KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc,
    	   "pfsyncif %p != sc %p", pfsyncif, sc);
    	SMR_PTR_SET_LOCKED(&pfsyncif, NULL);
    	smr_init(&smr);
    	smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs);
    
    	/* stop pf producing work before cleaning up the timeouts and tasks */
    	refcnt_finalize(&sc->sc_refs, "pfsyncfini");
    
    	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN);
    
    	rw_enter_read(&pf_state_list.pfs_rwl);
    	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
    	if (sc->sc_bulk_snd.snd_tail != NULL) {
    		sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo);
    
    		sc->sc_bulk_snd.snd_again = 0;
    		sc->sc_bulk_snd.snd_next = NULL;
    		sc->sc_bulk_snd.snd_tail = NULL;
    	}
    	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
    	rw_exit_read(&pf_state_list.pfs_rwl);
    
    	/*
    	 * do a single barrier for all the timeouts. because the
    	 * timeouts in each slice are configured the same way, the
    	 * barrier for one will work for all of them.
    	 */
    	for (i = 0; i < nitems(sc->sc_slices); i++) {
    		struct pfsync_slice *s = &sc->sc_slices[i];
    
    		timeout_del(&s->s_tmo);
    		task_del(s->s_softnet, &s->s_task);
    		task_del(s->s_softnet, &s->s_send);
    
    		timeout_del(&s->s_deferrals_tmo);
    		task_del(s->s_softnet, &s->s_deferrals_task);
    	}
    	timeout_barrier(&sc->sc_slices[0].s_tmo);
    	timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */
    	if (sndbar) {
    		/* technically the preceding barrier does the same job */
    		timeout_barrier(&sc->sc_bulk_snd.snd_tmo);
    	}
    	net_tq_barriers("pfsyncbar");
    
    	/* pfsync is no longer running */
    
    	if (sc->sc_inm != NULL) {
    		inm = sc->sc_inm;
    		sc->sc_inm = NULL;
    	}
    
    	for (i = 0; i < nitems(sc->sc_slices); i++) {
    		struct pfsync_slice *s = &sc->sc_slices[i];
    		struct pf_state *st;
    
    		pfsync_slice_drop(sc, s);
    		mq_purge(&s->s_sendq);
    
    		while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) {
    			TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
    
    			st = pd->pd_st;
    			st->sync_defer = NULL;
    
    			TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
    		}
    		s->s_deferred = 0;
    	}
    
    	NET_LOCK();
    	sc->sc_up = 0;
    
    	if (inm != NULL)
    		in_delmulti(inm);
    
    	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
    		TAILQ_REMOVE(&pds, pd, pd_entry);
    
    		pfsync_defer_output(pd);
    	}
    
    	return (0);
    }
    
    int
    pfsync_is_up(void)
    {
    	int rv;
    
    	smr_read_enter();
    	rv = SMR_PTR_GET(&pfsyncif) != NULL;
    	smr_read_leave();
    
    	return (rv);
    }
    
    static void
    pfsync_start(struct ifqueue *ifq)
    {
    	ifq_purge(ifq);
    }
    
    struct pfsync_q {
    	void		(*write)(struct pf_state *, void *);
    	size_t		len;
    	u_int8_t	action;
    };
    
    static struct pfsync_slice *
    pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
    {
    	unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices); 
    	struct pfsync_slice *s = &sc->sc_slices[idx];
    
    	if (!mtx_enter_try(&s->s_mtx)) {
    		mtx_enter(&s->s_mtx);
    		s->s_stat_contended++;
    	}
    	s->s_stat_locks++;
    
    	return (s);
    }
    
    static void
    pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s)
    {
    	mtx_leave(&s->s_mtx);
    }
    
    /* we have one of these for every PFSYNC_S_ */
    static void	pfsync_out_state(struct pf_state *, void *);
    static void	pfsync_out_iack(struct pf_state *, void *);
    static void	pfsync_out_upd_c(struct pf_state *, void *);
    static void	pfsync_out_del(struct pf_state *, void *);
    #if defined(IPSEC)
    static void	pfsync_out_tdb(struct tdb *, void *);
    #endif
    
    static const struct pfsync_q pfsync_qs[] = {
    	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
    	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
    	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
    	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
    	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
    };
    
    static void
    pfsync_out_state(struct pf_state *st, void *buf)
    {
    	struct pfsync_state *sp = buf;
    
    	mtx_enter(&st->mtx);
    	pf_state_export(sp, st);
    	mtx_leave(&st->mtx);
    }
    
    static void
    pfsync_out_iack(struct pf_state *st, void *buf)
    {
    	struct pfsync_ins_ack *iack = buf;
    
    	iack->id = st->id;
    	iack->creatorid = st->creatorid;
    }
    
    static void
    pfsync_out_upd_c(struct pf_state *st, void *buf)
    {
    	struct pfsync_upd_c *up = buf;
    
    	memset(up, 0, sizeof(*up));
    	up->id = st->id;
    	up->creatorid = st->creatorid;
    
    	mtx_enter(&st->mtx);
    	pf_state_peer_hton(&st->src, &up->src);
    	pf_state_peer_hton(&st->dst, &up->dst);
    	up->timeout = st->timeout;
    	mtx_leave(&st->mtx);
    }
    
    static void
    pfsync_out_del(struct pf_state *st, void *buf)
    {
    	struct pfsync_del_c *dp = buf;
    
    	dp->id = st->id;
    	dp->creatorid = st->creatorid;
    
    	st->sync_state = PFSYNC_S_DEAD;
    }
    
    #if defined(IPSEC)
    static inline void
    pfsync_tdb_enter(struct tdb *tdb)
    {
    	mtx_enter(&tdb->tdb_mtx);
    }
    
    static inline void
    pfsync_tdb_leave(struct tdb *tdb)
    {
    	unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
    	mtx_leave(&tdb->tdb_mtx);
    	if (snapped)
    		wakeup_one(&tdb->tdb_updates);
    }
    #endif /* defined(IPSEC) */
    
    static void
    pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s)
    {
    	struct pf_state *st;
    	int q;
    #if defined(IPSEC)
    	struct tdb *tdb;
    #endif
    
    	for (q = 0; q < nitems(s->s_qs); q++) {
    		if (TAILQ_EMPTY(&s->s_qs[q]))
    			continue;
    
    		while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) {
    			TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
    #ifdef PFSYNC_DEBUG
    			KASSERT(st->sync_state == q);
    #endif
    			st->sync_state = PFSYNC_S_NONE;
    			pf_state_unref(st);
    		}
    	}
    
    #if defined(IPSEC)
    	while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
    		TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
    
    		pfsync_tdb_enter(tdb);
    		KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
    		CLR(tdb->tdb_flags, TDBF_PFSYNC);
    		pfsync_tdb_leave(tdb);
    	}
    #endif /* defined(IPSEC) */
    
    	timeout_del(&s->s_tmo);
    	s->s_len = PFSYNC_MINPKT;
    }
    
    static struct mbuf *
    pfsync_slice_write(struct pfsync_slice *s)
    {
    	struct pfsync_softc *sc = s->s_pfsync;
    	struct mbuf *m;
    
    	struct ip *ip;
    	struct pfsync_header *ph;
    	struct pfsync_subheader *subh;
    
    	unsigned int mlen = max_linkhdr + s->s_len;
    	unsigned int q, count;
    	caddr_t ptr;
    	size_t off;
    
    	MUTEX_ASSERT_LOCKED(&s->s_mtx);
    	if (s->s_len == PFSYNC_MINPKT) {
    		s->s_stat_write_nop++;
    		return (NULL);
    	}
    
    	task_del(s->s_softnet, &s->s_task);
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL)
    		goto drop;
    
    	if (mlen > MHLEN) {
    		MCLGETL(m, M_DONTWAIT, mlen);
    		if (!ISSET(m->m_flags, M_EXT))
    			goto drop;
    	}
    
    	m_align(m, s->s_len);
    	m->m_len = m->m_pkthdr.len = s->s_len;
    
    	ptr = mtod(m, caddr_t);
    	off = 0;
    
    	ip = (struct ip *)(ptr + off);
    	off += sizeof(*ip);
    	*ip = sc->sc_template;
    	ip->ip_len = htons(m->m_pkthdr.len);
    	ip->ip_id = htons(ip_randomid());
    
    	ph = (struct pfsync_header *)(ptr + off);
    	off += sizeof(*ph);
    	memset(ph, 0, sizeof(*ph));
    	ph->version = PFSYNC_VERSION;
    	ph->len = htons(m->m_pkthdr.len - sizeof(*ip));
    
    	for (q = 0; q < nitems(s->s_qs); q++) {
    		struct pf_state_queue *psq = &s->s_qs[q];
    		struct pf_state *st;
    
    		if (TAILQ_EMPTY(psq))
    			continue;
    
    		subh = (struct pfsync_subheader *)(ptr + off);
    		off += sizeof(*subh);
    
    		count = 0;
    		while ((st = TAILQ_FIRST(psq)) != NULL) {
    			TAILQ_REMOVE(psq, st, sync_list);
    			count++;
    
    			KASSERT(st->sync_state == q);
    			/* the write handler below may override this */
    			st->sync_state = PFSYNC_S_NONE;
    
    			pfsync_qs[q].write(st, ptr + off);
    			off += pfsync_qs[q].len;
    
    			pf_state_unref(st);
    		}
    
    		subh->action = pfsync_qs[q].action;
    		subh->len = pfsync_qs[q].len >> 2;
    		subh->count = htons(count);
    	}
    
    #if defined(IPSEC)
    	if (!TAILQ_EMPTY(&s->s_tdb_q)) {
    		struct tdb *tdb;
    
    		subh = (struct pfsync_subheader *)(ptr + off);
    		off += sizeof(*subh);
    
    		count = 0;
    		while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
    			TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
    			count++;
    
    			pfsync_tdb_enter(tdb);
    			KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
    
    			/* get a consistent view of the counters */
    			pfsync_out_tdb(tdb, ptr + off);
    
    			CLR(tdb->tdb_flags, TDBF_PFSYNC);
    			pfsync_tdb_leave(tdb);
    
    			off += sizeof(struct pfsync_tdb);
    		}
    
    		subh->action = PFSYNC_ACT_TDB;
    		subh->len = sizeof(struct pfsync_tdb) >> 2;
    		subh->count = htons(count);
    	}
    #endif
    
    	timeout_del(&s->s_tmo);
    	s->s_len = PFSYNC_MINPKT;
    
    	return (m);
    drop:
    	m_freem(m);
    	pfsyncstat_inc(pfsyncs_onomem);
    	pfsync_slice_drop(sc, s);
    	return (NULL);
    }
    
    static void
    pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m)
    {
    	struct ip_moptions imo;
    	unsigned int len = m->m_pkthdr.len;
    #if NBPFILTER > 0
    	caddr_t if_bpf = sc->sc_if.if_bpf;
    	if (if_bpf)
    		bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT);
    #endif
    
    	imo.imo_ifidx = sc->sc_sync_ifidx;
    	imo.imo_ttl = PFSYNC_DFLTTL;
    	imo.imo_loop = 0;
    	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
    
    	if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) {
    		counters_pkt(sc->sc_if.if_counters, ifc_opackets,
    		    ifc_obytes, len);
    		pfsyncstat_inc(pfsyncs_opackets);
    	} else {
    		counters_inc(sc->sc_if.if_counters, ifc_oerrors);
    		pfsyncstat_inc(pfsyncs_oerrors);
    	}
    }
    
    static void
    pfsync_slice_tmo(void *arg)
    {
    	struct pfsync_slice *s = arg;
    
    	task_add(s->s_softnet, &s->s_task);
    }
    
    static void
    pfsync_slice_sched(struct pfsync_slice *s)
    {
    	s->s_stat_task_add++;
    	task_add(s->s_softnet, &s->s_task);
    }
    
    static void
    pfsync_slice_task(void *arg)
    {
    	struct pfsync_slice *s = arg;
    	struct mbuf *m;
    
    	mtx_enter(&s->s_mtx);
    	s->s_stat_task_run++;
    
    	m = pfsync_slice_write(s);
    	mtx_leave(&s->s_mtx);
    	if (m != NULL) {
    		NET_LOCK();
    		pfsync_sendout(s->s_pfsync, m);
    		NET_UNLOCK();
    	}
    }
    
    static void
    pfsync_slice_sendq(void *arg)
    {
    	struct pfsync_slice *s = arg;
    	struct mbuf_list ml;
    	struct mbuf *m;
    
    	mq_delist(&s->s_sendq, &ml);
    	if (ml_empty(&ml))
    		return;
    
    	mtx_enter(&s->s_mtx);
    	s->s_stat_dequeue++;
    	mtx_leave(&s->s_mtx);
    
    	NET_LOCK();
    	while ((m = ml_dequeue(&ml)) != NULL)
    		pfsync_sendout(s->s_pfsync, m);
    	NET_UNLOCK();
    }
    
    static void
    pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q)
    {
    	size_t nlen = pfsync_qs[q].len;
    	struct mbuf *m = NULL;
    
    	MUTEX_ASSERT_LOCKED(&s->s_mtx);
    	KASSERT(st->sync_state == PFSYNC_S_NONE);
    	KASSERT(s->s_len >= PFSYNC_MINPKT);
    
    	if (TAILQ_EMPTY(&s->s_qs[q]))
    		nlen += sizeof(struct pfsync_subheader);
    
    	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
    		m = pfsync_slice_write(s);
    		if (m != NULL) {
    			s->s_stat_enqueue++;
    			if (mq_enqueue(&s->s_sendq, m) == 0)
    				task_add(s->s_softnet, &s->s_send);
    		}
    
    		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
    	}
    
    	s->s_len += nlen;
    	pf_state_ref(st);
    	TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
    	st->sync_state = q;
    
    	if (!timeout_pending(&s->s_tmo))
    		timeout_add_sec(&s->s_tmo, 1);
    }
    
    static void
    pfsync_q_del(struct pfsync_slice *s, struct pf_state *st)
    {
    	unsigned int q = st->sync_state;
    
    	MUTEX_ASSERT_LOCKED(&s->s_mtx);
    	KASSERT(st->sync_state < PFSYNC_S_NONE);
    
    	st->sync_state = PFSYNC_S_NONE;
    	TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
    	pf_state_unref(st);
    	s->s_len -= pfsync_qs[q].len;
    
    	if (TAILQ_EMPTY(&s->s_qs[q]))
    		s->s_len -= sizeof(struct pfsync_subheader);
    }
    
    /*
     * the pfsync hooks that pf calls
     */
    
    void
    pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw,
        const struct pf_state_key *sks, int flags)
    {
    	/* this is called before pf_state_insert */
    
    	if (skw->proto == IPPROTO_PFSYNC)
    		SET(st->state_flags, PFSTATE_NOSYNC);
    
    	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
    		st->sync_state = PFSYNC_S_DEAD;
    		return;
    	}
    
    	if (ISSET(flags, PFSYNC_SI_IOCTL)) {
    		/* all good */
    		return;
    	}
    
    	/* state came off the wire */
    	if (ISSET(flags, PFSYNC_SI_PFSYNC)) {
    		if (ISSET(st->state_flags, PFSTATE_ACK)) {
    			CLR(st->state_flags, PFSTATE_ACK);
    
    			/* peer wants an iack, not an insert */
    			st->sync_state = PFSYNC_S_SYNC;
    		} else
    			st->sync_state = PFSYNC_S_PFSYNC;
    	}
    }
    
    void
    pfsync_insert_state(struct pf_state *st)
    {
    	struct pfsync_softc *sc;
    
    	MUTEX_ASSERT_UNLOCKED(&st->mtx);
    
    	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
    	    st->sync_state == PFSYNC_S_DEAD)
    		return;
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
    
    		switch (st->sync_state) {
    		case PFSYNC_S_UPD_C:
    			/* we must have lost a race after insert */
    			pfsync_q_del(s, st);
    			/* FALLTHROUGH */
    		case PFSYNC_S_NONE:
    			pfsync_q_ins(s, st, PFSYNC_S_INS);
    			break;
    		case PFSYNC_S_SYNC:
    			st->sync_state = PFSYNC_S_NONE; /* gross */
    			pfsync_q_ins(s, st, PFSYNC_S_IACK);
    			pfsync_slice_sched(s); /* the peer is waiting */
    			break;
    		case PFSYNC_S_PFSYNC:
    			/* state was just inserted by pfsync */
    			st->sync_state = PFSYNC_S_NONE;
    			break;
    		default:
    			panic("%s: state %p unexpected sync_state %d",
    			    __func__, st, st->sync_state);
    			/* NOTREACHED */
    		}
    
    		pfsync_slice_leave(sc, s);
    	}
    	smr_read_leave();
    }
    
    void
    pfsync_update_state(struct pf_state *st)
    {
    	struct pfsync_softc *sc;
    
    	MUTEX_ASSERT_UNLOCKED(&st->mtx);
    
    	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
    	    st->sync_state == PFSYNC_S_DEAD)
    		return;
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
    		int sync = 0;
    
    		switch (st->sync_state) {
    		case PFSYNC_S_UPD_C:
    		case PFSYNC_S_UPD:
    			/* we're already handling it */
    			if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
    				st->sync_updates++;
    				if (st->sync_updates >= sc->sc_maxupdates)
    					sync = 1;
    			}
    			/* FALLTHROUGH */
    		case PFSYNC_S_INS:
    		case PFSYNC_S_DEL:
    		case PFSYNC_S_DEAD:
    			break;
    
    		case PFSYNC_S_IACK:
    			pfsync_q_del(s, st);
    			/* FALLTHROUGH */
    		case PFSYNC_S_NONE:
    			pfsync_q_ins(s, st, PFSYNC_S_UPD_C);
    			st->sync_updates = 0;
    			break;
    		default:
    			panic("%s: state %p unexpected sync_state %d",
    			    __func__, st, st->sync_state);
    			/* NOTREACHED */
    		}
    
    		if (!sync && (getuptime() - st->pfsync_time) < 2)
    			sync = 1;
    
    		if (sync)
    			pfsync_slice_sched(s);
    		pfsync_slice_leave(sc, s);
    	}
    	smr_read_leave();
    }
    
    void
    pfsync_delete_state(struct pf_state *st)
    {
    	struct pfsync_softc *sc;
    
    	MUTEX_ASSERT_UNLOCKED(&st->mtx);
    
    	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
    	    st->sync_state == PFSYNC_S_DEAD)
    		return;
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
    
    		switch (st->sync_state) {
    		case PFSYNC_S_INS:
    			/* let's pretend this never happened */
    			pfsync_q_del(s, st);
    			break;
    
    		case PFSYNC_S_UPD_C:
    		case PFSYNC_S_UPD:
    		case PFSYNC_S_IACK:
    			pfsync_q_del(s, st);
    			/* FALLTHROUGH */
    		case PFSYNC_S_NONE:
    			pfsync_q_ins(s, st, PFSYNC_S_DEL);
    			st->sync_updates = 0;
    			break;
    		case PFSYNC_S_DEL:
    		case PFSYNC_S_DEAD:
    			/* XXX we should count this */
    			break;
    		default:
    			panic("%s: state %p unexpected sync_state %d",
    			    __func__, st, st->sync_state);
    			/* NOTREACHED */
    		}
    
    		pfsync_slice_leave(sc, s);
    	}
    	smr_read_leave();
    }
    
    struct pfsync_subh_clr {
    	struct pfsync_subheader	subh;
    	struct pfsync_clr	clr;
    } __packed __aligned(4);
    
    void
    pfsync_clear_states(u_int32_t creatorid, const char *ifname)
    {
    	struct pfsync_softc *sc;
    	struct pfsync_subh_clr *h;
    	struct mbuf *m;
    	unsigned int hlen, mlen;
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL)
    		refcnt_take(&sc->sc_refs);
    	smr_read_leave();
    
    	if (sc == NULL)
    		return;
    
    	hlen = sizeof(sc->sc_template) +
    	    sizeof(struct pfsync_header) +
    	    sizeof(*h);
    
    	mlen = max_linkhdr + hlen;
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL) {
    		/* count error */
    		goto leave;
    	}
    
    	if (mlen > MHLEN) {
    		MCLGETL(m, M_DONTWAIT, mlen);
    		if (!ISSET(m->m_flags, M_EXT)) {
    			m_freem(m);
    			goto leave;
    		}
    	}
    
    	m_align(m, sizeof(*h));
    	h = mtod(m, struct pfsync_subh_clr *);
    
    	h->subh.action = PFSYNC_ACT_CLR;
    	h->subh.len = sizeof(h->clr) >> 2;
    	h->subh.count = htons(1);
    
    	strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname));
    	h->clr.creatorid = creatorid;
    
    	m->m_pkthdr.len = m->m_len = sizeof(*h);
    	m = pfsync_encap(sc, m);
    	if (m == NULL)
    		goto leave;
    
    	pfsync_sendout(sc, m);
    leave:
    	refcnt_rele_wake(&sc->sc_refs);
    }
    
    int
    pfsync_state_in_use(struct pf_state *st)
    {
    	struct pfsync_softc *sc;
    	int rv = 0;
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		/*
    		 * pfsync bulk sends run inside
    		 * rw_enter_read(&pf_state_list.pfs_rwl), and this
    		 * code (pfsync_state_in_use) is only called from the
    		 * purge code inside
    		 * rw_enter_write(&pf_state_list.pfs_rwl). therefore,
    		 * those two sections are exclusive so we can safely
    		 * look at the bulk send pointers.
    		 */
    		/* rw_assert_wrlock(&pf_state_list.pfs_rwl); */
    		if (sc->sc_bulk_snd.snd_next == st ||
    		    sc->sc_bulk_snd.snd_tail == st)
    			rv = 1;
    	}
    	smr_read_leave();
    
    	return (rv);
    }
    
    int
    pfsync_defer(struct pf_state *st, struct mbuf *m)
    {
    	struct pfsync_softc *sc;
    	struct pfsync_slice *s;
    	struct pfsync_deferral *pd;
    	int sched = 0;
    	int rv = 0;
    
    	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
    	    ISSET(m->m_flags, M_BCAST|M_MCAST))
    		return (0);
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc == NULL || !sc->sc_defer)
    		goto leave;
    
    	pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT);
    	if (pd == NULL) {
    		goto leave;
    	}
    
    	s = pfsync_slice_enter(sc, st);
    	s->s_stat_defer_add++;
    
    	pd->pd_st = pf_state_ref(st);
    	pd->pd_m = m;
    	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
    
    	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
    	st->sync_defer = pd;
    
    	sched = s->s_deferred++;
    	TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
    
    	if (sched == 0)
    		timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
    	else if (sched >= PFSYNC_DEFER_LIMIT) {
    		s->s_stat_defer_overlimit++;
    		timeout_del(&s->s_deferrals_tmo);
    		task_add(s->s_softnet, &s->s_deferrals_task);
    	}
    
    	pfsync_slice_sched(s);
    	pfsync_slice_leave(sc, s);
    	rv = 1;
    leave:
    	smr_read_leave();
    
    	return (rv);
    }
    
    static void
    pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st)
    {
    	struct pfsync_slice *s;
    	struct pfsync_deferral *pd;
    
    	s = pfsync_slice_enter(sc, st);
    
    	pd = st->sync_defer;
    	if (pd != NULL) {
    		s->s_stat_defer_ack++;
    
    		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
    		s->s_deferred--;
    
    		st = pd->pd_st;
    		st->sync_defer = NULL;
    	}
    	pfsync_slice_leave(sc, s);
    
    	if (pd != NULL)
    		pfsync_defer_output(pd);
    }
    
    static void
    pfsync_deferrals_tmo(void *arg)
    {
    	struct pfsync_slice *s = arg;
    
    	if (READ_ONCE(s->s_deferred) > 0)
    		task_add(s->s_softnet, &s->s_deferrals_task);
    }
    
    static void
    pfsync_deferrals_task(void *arg)
    {
    	struct pfsync_slice *s = arg;
    	struct pfsync_deferral *pd;
    	struct pf_state *st;
    	uint64_t now, nsec = 0;
    	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
    
    	now = getnsecuptime();
    
    	mtx_enter(&s->s_mtx);
    	s->s_stat_defer_run++; /* maybe move this into the loop */
    	for (;;) {
    		pd = TAILQ_FIRST(&s->s_deferrals);
    		if (pd == NULL)
    			break;
    
    		if (s->s_deferred < PFSYNC_DEFER_LIMIT &&
    		    now < pd->pd_deadline) {
    			nsec = pd->pd_deadline - now;
    			break;
    		}
    
    		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
    		s->s_deferred--;
    
    		/*
    		 * detach the pd from the state. the pd still refers
    		 * to the state though.
    		 */
    		st = pd->pd_st;
    		st->sync_defer = NULL;
    
    		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
    	}
    	mtx_leave(&s->s_mtx);
    
    	if (nsec > 0) {
    		/* we were looking at a pd, but it wasn't old enough */
    		timeout_add_nsec(&s->s_deferrals_tmo, nsec);
    	}
    
    	if (TAILQ_EMPTY(&pds))
    		return;
    
    	NET_LOCK();
    	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
    		TAILQ_REMOVE(&pds, pd, pd_entry);
    
    		pfsync_defer_output(pd);
    	}
    	NET_UNLOCK();
    }
    
    static void
    pfsync_defer_output(struct pfsync_deferral *pd)
    {
    	struct pf_pdesc pdesc;
    	struct pf_state *st = pd->pd_st;
    
    	if (st->rt == PF_ROUTETO) {
    		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
    		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
    			return;
    		switch (st->key[PF_SK_WIRE]->af) {
    		case AF_INET:
    			pf_route(&pdesc, st);
    			break;
    #ifdef INET6
    		case AF_INET6:
    			pf_route6(&pdesc, st);
    			break;
    #endif /* INET6 */
    		default:
    			unhandled_af(st->key[PF_SK_WIRE]->af);
    		}
    		pd->pd_m = pdesc.m;
    	} else {
    		switch (st->key[PF_SK_WIRE]->af) {
    		case AF_INET:
    			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
    			break;
    #ifdef INET6
    		case AF_INET6:
    			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
    			break;
    #endif /* INET6 */
    		default:
    			unhandled_af(st->key[PF_SK_WIRE]->af);
    		}
    
    		pd->pd_m = NULL;
    	}
    
    	pf_state_unref(st);
    	m_freem(pd->pd_m);
    	pool_put(&pfsync_deferrals_pool, pd);
    }
    
    struct pfsync_subh_bus {
    	struct pfsync_subheader	subh;
    	struct pfsync_bus	bus;
    } __packed __aligned(4);
    
    static unsigned int
    pfsync_bulk_snd_bus(struct pfsync_softc *sc,
        struct mbuf *m, const unsigned int space,
        uint32_t endtime, uint8_t status)
    {
    	struct pfsync_subh_bus *h;
    	unsigned int nlen;
    
    	nlen = m->m_len + sizeof(*h);
    	if (space < nlen)
    		return (0);
    
    	h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len);
    	memset(h, 0, sizeof(*h));
    
    	h->subh.action = PFSYNC_ACT_BUS;
    	h->subh.len = sizeof(h->bus) >> 2;
    	h->subh.count = htons(1);
    
    	h->bus.creatorid = pf_status.hostid;
    	h->bus.endtime = htonl(endtime);
    	h->bus.status = status;
    
    	m->m_len = nlen;
    
    	return (1);
    }
    
    static unsigned int
    pfsync_bulk_snd_states(struct pfsync_softc *sc,
        struct mbuf *m, const unsigned int space, unsigned int len)
    {
    	struct pf_state *st;
    	struct pfsync_state *sp;
    	unsigned int nlen;
    	unsigned int count = 0;
    
    	st = sc->sc_bulk_snd.snd_next;
    
    	for (;;) {
    		nlen = len + sizeof(*sp);
    		sp = (struct pfsync_state *)(mtod(m, caddr_t) + len);
    		if (space < nlen)
    			break;
    
    		mtx_enter(&st->mtx);
    		pf_state_export(sp, st);
    		mtx_leave(&st->mtx);
    
    		/* commit */
    		count++;
    		m->m_len = len = nlen;
    
    		if (st == sc->sc_bulk_snd.snd_tail) {
    			if (pfsync_bulk_snd_bus(sc, m, space,
    			    0, PFSYNC_BUS_END) == 0) {
    				/* couldn't fit the BUS */
    				st = NULL;
    				break;
    			}
    
    			/* this BUS is done */
    			pfsync_dprintf(sc, "bulk send done (%s)", __func__);
    			sc->sc_bulk_snd.snd_again = 0; /* XXX */
    			sc->sc_bulk_snd.snd_next = NULL;
    			sc->sc_bulk_snd.snd_tail = NULL;
    			return (count);
    		}
    
    		st = TAILQ_NEXT(st, entry_list);
    	}
    
    	/* there's still work to do */
    	sc->sc_bulk_snd.snd_next = st;
    	timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS);
    
    	return (count);
    }
    
    static unsigned int
    pfsync_bulk_snd_sub(struct pfsync_softc *sc,
        struct mbuf *m, const unsigned int space)
    {
    	struct pfsync_subheader *subh;
    	unsigned int count;
    	unsigned int len, nlen;
    
    	len = m->m_len;
    	nlen = len + sizeof(*subh);
    	if (nlen > space)
    		return (0);
    
    	subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len);
    
    	/*
    	 * pfsync_bulk_snd_states only updates m->m_len after
    	 * filling in a state after the offset we gave it.
    	 */
    	count = pfsync_bulk_snd_states(sc, m, space, nlen);
    	if (count == 0)
    		return (0);
    
    	subh->action = PFSYNC_ACT_UPD;
    	subh->len = sizeof(struct pfsync_state) >> 2;
    	subh->count = htons(count);
    
    	return (count);
    }
    
    static void
    pfsync_bulk_snd_start(struct pfsync_softc *sc)
    {
    	const unsigned int space = sc->sc_if.if_mtu -
    	    (sizeof(struct ip) + sizeof(struct pfsync_header));
    	struct mbuf *m;
    
    	rw_enter_read(&pf_state_list.pfs_rwl);
    
    	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
    	if (sc->sc_bulk_snd.snd_next != NULL) {
    		sc->sc_bulk_snd.snd_again = 1;
    		goto leave;
    	}
    
    	mtx_enter(&pf_state_list.pfs_mtx);
    	sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list);
    	sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list,
    	    pf_state_queue);
    	mtx_leave(&pf_state_list.pfs_mtx);
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL)
    		goto leave;
    
    	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
    	if (!ISSET(m->m_flags, M_EXT)) {
    		/* some error++ */
    		m_freem(m); /* drop */
    		goto leave;
    	}
    
    	m_align(m, space);
    	m->m_len = 0;
    
    	if (sc->sc_bulk_snd.snd_tail == NULL) {
    		pfsync_dprintf(sc, "bulk send empty (%s)", __func__);
    
    		/* list is empty */
    		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
    			panic("%s: mtu is too low", __func__);
    		goto encap;
    	}
    
    	pfsync_dprintf(sc, "bulk send start (%s)", __func__);
    
    	/* start a bulk update. */
    	if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0)
    		panic("%s: mtu is too low", __func__);
    
    	/* fill it up with state updates. */
    	pfsync_bulk_snd_sub(sc, m, space);
    
    encap:
    	m->m_pkthdr.len = m->m_len;
    	m = pfsync_encap(sc, m);
    	if (m == NULL)
    		goto leave;
    
    	pfsync_sendout(sc, m);
    
    leave:
    	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
    
    	rw_exit_read(&pf_state_list.pfs_rwl);
    }
    
    static void
    pfsync_bulk_snd_tmo(void *arg)
    {
    	struct pfsync_softc *sc = arg;
    	const unsigned int space = sc->sc_if.if_mtu -
    	    (sizeof(struct ip) + sizeof(struct pfsync_header));
    	struct mbuf *m;
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL) {
    		/* some error++ */
    		/* retry later */
    		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
    		    PFSYNC_BULK_SND_IVAL_MS);
    		return;
    	}
    
    	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
    	if (!ISSET(m->m_flags, M_EXT)) {
    		/* some error++ */
    		m_freem(m);
    		/* retry later */
    		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
    		    PFSYNC_BULK_SND_IVAL_MS);
    		return;
    	}
    
    	m_align(m, space);
    	m->m_len = 0;
    
    	rw_enter_read(&pf_state_list.pfs_rwl);
    	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
    
    	if (sc->sc_bulk_snd.snd_next == NULL) {
    		/* there was no space in the previous packet for a BUS END */
    
    		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
    			panic("%s: mtu is too low", __func__);
    
    		/* this bulk is done */
    		pfsync_dprintf(sc, "bulk send done (%s)", __func__);
    		sc->sc_bulk_snd.snd_again = 0; /* XXX */
    		sc->sc_bulk_snd.snd_tail = NULL;
    	} else {
    		pfsync_dprintf(sc, "bulk send again (%s)", __func__);
    
    		/* fill it up with state updates. */
    		pfsync_bulk_snd_sub(sc, m, space);
    	}
    
    	m->m_pkthdr.len = m->m_len;
    	m = pfsync_encap(sc, m);
    
    	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
    	rw_exit_read(&pf_state_list.pfs_rwl);
    
    	if (m != NULL) {
    		NET_LOCK();
    		pfsync_sendout(sc, m);
    		NET_UNLOCK();
    	}
    }
    
    static void
    pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st)
    {
    	struct pfsync_slice *s = pfsync_slice_enter(sc, st);
    
    	switch (st->sync_state) {
    	case PFSYNC_S_UPD_C:
    	case PFSYNC_S_IACK:
    		pfsync_q_del(s, st);
    		/* FALLTHROUGH */
    	case PFSYNC_S_NONE:
    		pfsync_q_ins(s, st, PFSYNC_S_UPD);
    		break;
    
    	case PFSYNC_S_INS:
    	case PFSYNC_S_UPD:
    	case PFSYNC_S_DEL:
    		/* we're already handling it */
    		break;
    	default:
    		panic("%s: state %p unexpected sync_state %d",
    		    __func__, st, st->sync_state);
    	}
    
    	pfsync_slice_sched(s);
    	pfsync_slice_leave(sc, s);
    }
    
    #if defined(IPSEC)
    static void
    pfsync_out_tdb(struct tdb *tdb, void *buf)
    {
    	struct pfsync_tdb *ut = buf;
    
    	memset(ut, 0, sizeof(*ut));
    	ut->spi = tdb->tdb_spi;
    	memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst));
    	/*
    	 * When a failover happens, the master's rpl is probably above
    	 * what we see here (we may be up to a second late), so
    	 * increase it a bit for outbound tdbs to manage most such
    	 * situations.
    	 *
    	 * For now, just add an offset that is likely to be larger
    	 * than the number of packets we can see in one second. The RFC
    	 * just says the next packet must have a higher seq value.
    	 *
    	 * XXX What is a good algorithm for this? We could use
    	 * a rate-determined increase, but to know it, we would have
    	 * to extend struct tdb.
    	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
    	 * will soon be replaced anyway. For now, just don't handle
    	 * this edge case.
    	 */
    #define RPL_INCR 16384
    	ut->rpl = htobe64(tdb->tdb_rpl +
    	    (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0));
    	ut->cur_bytes = htobe64(tdb->tdb_cur_bytes);
    	ut->sproto = tdb->tdb_sproto;
    	ut->rdomain = htons(tdb->tdb_rdomain);
    }
    
    static struct pfsync_slice *
    pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t)
    {
    	/*
    	 * just use the first slice for all ipsec (for now) until
    	 * it's more obvious what property (eg, spi) we can distribute
    	 * tdbs over slices with.
    	 */
    	struct pfsync_slice *s = &sc->sc_slices[0];
    
    	if (!mtx_enter_try(&s->s_mtx)) {
    		mtx_enter(&s->s_mtx);
    		s->s_stat_contended++;
    	}
    	s->s_stat_locks++;
    
    	return (s);
    }
    
    static void
    pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb)
    {
    	size_t nlen = sizeof(struct pfsync_tdb);
    	struct mbuf *m = NULL;
    
    	KASSERT(s->s_len >= PFSYNC_MINPKT);
    
    	MUTEX_ASSERT_LOCKED(&s->s_mtx);
    	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
    
    	if (TAILQ_EMPTY(&s->s_tdb_q))
    		nlen += sizeof(struct pfsync_subheader);
    
    	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
    		m = pfsync_slice_write(s);
    		if (m != NULL) {
    			s->s_stat_enqueue++;
    			if (mq_enqueue(&s->s_sendq, m) == 0)
    				task_add(s->s_softnet, &s->s_send);
    		}
    
    		nlen = sizeof(struct pfsync_subheader) +
    		    sizeof(struct pfsync_tdb);
    	}
    
    	s->s_len += nlen;
    	TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry);
    	tdb->tdb_updates = 0;
    
    	if (!timeout_pending(&s->s_tmo))
    		timeout_add_sec(&s->s_tmo, 1);
    }
    
    static void
    pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb)
    {
    	MUTEX_ASSERT_LOCKED(&s->s_mtx);
    	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
    
    	TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
    
    	s->s_len -= sizeof(struct pfsync_tdb);
    	if (TAILQ_EMPTY(&s->s_tdb_q))
    		s->s_len -= sizeof(struct pfsync_subheader);
    }
    
    /*
     * the reference that pfsync has to a tdb is accounted for by the
     * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is
     * called after all other references to a tdb are dropped (with
     * tdb_unref) as part of the tdb_free().
     *
     * tdb_free() needs to wait for pfsync to let go of the tdb though,
     * which would be best handled by a reference count, but tdb_free
     * needs the NET_LOCK which pfsync is already fighting with. instead
     * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop
     * with tdb_free.
     */
    
    void
    pfsync_update_tdb(struct tdb *tdb, int output)
    {
    	struct pfsync_softc *sc;
    
    	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
    
    		/* TDBF_PFSYNC is only changed while the slice mtx is held */
    		if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
    			mtx_enter(&tdb->tdb_mtx);
    			SET(tdb->tdb_flags, TDBF_PFSYNC);
    			mtx_leave(&tdb->tdb_mtx);
    
    			pfsync_tdb_ins(s, tdb);
    		} else if (++tdb->tdb_updates >= sc->sc_maxupdates)
    			pfsync_slice_sched(s);
    
    		/* XXX no sync timestamp on tdbs to check */
    
    		pfsync_slice_leave(sc, s);
    	}
    	smr_read_leave();
    }
    
    void
    pfsync_delete_tdb(struct tdb *tdb)
    {
    	struct pfsync_softc *sc;
    
    	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
    
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc != NULL) {
    		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
    
    		/* TDBF_PFSYNC is only changed while the slice mtx is held */
    		if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
    			pfsync_tdb_del(s, tdb);
    
    			mtx_enter(&tdb->tdb_mtx);
    			CLR(tdb->tdb_flags, TDBF_PFSYNC);
    			mtx_leave(&tdb->tdb_mtx);
    		}
    
    		pfsync_slice_leave(sc, s);
    	}
    	smr_read_leave();
    
    	/*
    	 * handle pfsync_slice_drop being called from pfsync_down
    	 * and the smr/slice access above won't work.
    	 */
    
    	mtx_enter(&tdb->tdb_mtx);
    	SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */
    	while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
    		msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT,
    		    "tdbfree", INFSLP);
    	}
    	mtx_leave(&tdb->tdb_mtx);
    }
    #endif /* defined(IPSEC) */
    
    struct pfsync_act {
    	void (*in)(struct pfsync_softc *, const caddr_t,
    	    unsigned int, unsigned int);
    	size_t len;
    };
    
    static void	pfsync_in_clr(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_iack(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_upd_c(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_ureq(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_del(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_del_c(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_bus(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_tdb(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_ins(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    static void	pfsync_in_upd(struct pfsync_softc *,
    		    const caddr_t, unsigned int, unsigned int);
    
    static const struct pfsync_act pfsync_acts[] = {
    	[PFSYNC_ACT_CLR] =
    	    { pfsync_in_clr,	sizeof(struct pfsync_clr) },
    	[PFSYNC_ACT_INS_ACK] =
    	    { pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
    	[PFSYNC_ACT_UPD_C] =
    	    { pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
    	[PFSYNC_ACT_UPD_REQ] =
    	    { pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
    	[PFSYNC_ACT_DEL] =
    	    { pfsync_in_del,	sizeof(struct pfsync_state) },
    	[PFSYNC_ACT_DEL_C] =
    	    { pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
    	[PFSYNC_ACT_BUS] =
    	    { pfsync_in_bus,	sizeof(struct pfsync_bus) },
    	[PFSYNC_ACT_INS] =
    	    { pfsync_in_ins,	sizeof(struct pfsync_state) },
    	[PFSYNC_ACT_UPD] =
    	    { pfsync_in_upd,	sizeof(struct pfsync_state) },
    	[PFSYNC_ACT_TDB] =
    	    { pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
    };
    
    static void
    pfsync_in_skip(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	/* nop */
    }
    
    static struct mbuf *
    pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen)
    {
    	struct pfsync_softc *sc;
    	struct pfsync_header *ph;
    	struct pfsync_subheader *subh;
    	unsigned int len;
    	void (*in)(struct pfsync_softc *,
    	    const caddr_t, unsigned int, unsigned int);
    
    	pfsyncstat_inc(pfsyncs_ipackets);
    
    	if (!pf_status.running)
    		return (m);
    
    	/*
    	 * pfsyncif is only set if it is up and running correctly.
    	 */
    	smr_read_enter();
    	sc = SMR_PTR_GET(&pfsyncif);
    	if (sc == NULL)
    		goto leave;
    
    	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
    		pfsyncstat_inc(pfsyncs_badif);
    		goto leave;
    	}
    
    	/* verify that the IP TTL is 255. */
    	if (ttl != PFSYNC_DFLTTL) {
    		pfsyncstat_inc(pfsyncs_badttl);
    		goto leave;
    	}
    
    	m_adj(m, hlen);
    
    	if (m->m_pkthdr.len < sizeof(*ph)) {
    		pfsyncstat_inc(pfsyncs_hdrops);
    		goto leave;
    	}
    	if (m->m_len < sizeof(*ph)) {
    		m = m_pullup(m, sizeof(*ph));
    		if (m == NULL)
    			goto leave;
    	}
    
    	ph = mtod(m, struct pfsync_header *);
    	if (ph->version != PFSYNC_VERSION) {
    		pfsyncstat_inc(pfsyncs_badver);
    		goto leave;
    	}
    
    	len = ntohs(ph->len);
    	if (m->m_pkthdr.len < len) {
    		pfsyncstat_inc(pfsyncs_badlen);
    		goto leave;
    	}
    	if (m->m_pkthdr.len > len)
    		m->m_pkthdr.len = len;
    
    	/* ok, it's serious now */
    	refcnt_take(&sc->sc_refs);
    	smr_read_leave();
    
    	counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len);
    
    	m_adj(m, sizeof(*ph));
    
    	while (m->m_pkthdr.len >= sizeof(*subh)) {
    		unsigned int action, mlen, count;
    
    		if (m->m_len < sizeof(*subh)) {
    			m = m_pullup(m, sizeof(*subh));
    			if (m == NULL)
    				goto rele;
    		}
    		subh = mtod(m, struct pfsync_subheader *);
    
    		action = subh->action;
    		mlen = subh->len << 2;
    		count = ntohs(subh->count);
    
    		if (action >= PFSYNC_ACT_MAX ||
    		    action >= nitems(pfsync_acts) ||
    		    mlen < pfsync_acts[subh->action].len) {
    			/*
    			 * subheaders are always followed by at least one
    			 * message, so if the peer is new
    			 * enough to tell us how big its messages are then we
    			 * know enough to skip them.
    			 */
    			if (count == 0 || mlen == 0) {
    				pfsyncstat_inc(pfsyncs_badact);
    				goto rele;
    			}
    
    			in = pfsync_in_skip;
    		} else {
    			in = pfsync_acts[action].in;
    			if (in == NULL)
    				in = pfsync_in_skip;
    		}
    
    		m_adj(m, sizeof(*subh));
    		len = mlen * count;
    		if (len > m->m_pkthdr.len) {
    			pfsyncstat_inc(pfsyncs_badlen);
    			goto rele;
    		}
    		if (m->m_len < len) {
    			m = m_pullup(m, len);
    			if (m == NULL)
    				goto rele;
    		}
    
    		(*in)(sc, mtod(m, caddr_t), mlen, count);
    		m_adj(m, len);
    	}
    
    rele:
    	refcnt_rele_wake(&sc->sc_refs);
    	return (m);
    
    leave:
    	smr_read_leave();
    	return (m);
    }
    
    static void
    pfsync_in_clr(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_clr *clr;
    	struct pf_state *head, *tail, *st, *next;
    	struct pfi_kif *kif;
    	uint32_t creatorid;
    	unsigned int i;
    
    	rw_enter_read(&pf_state_list.pfs_rwl);
    
    	/* get a view of the state list */
    	mtx_enter(&pf_state_list.pfs_mtx);
    	head = TAILQ_FIRST(&pf_state_list.pfs_list);
    	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
    	mtx_leave(&pf_state_list.pfs_mtx);
    
    	PF_LOCK();
    	for (i = 0; i < count; i++) {
    		clr = (struct pfsync_clr *)(buf + i * mlen);
    
    		creatorid = clr->creatorid;
    		if (clr->ifname[0] == '\0')
    			kif = NULL;
    		else {
    			kif = pfi_kif_find(clr->ifname);
    			if (kif == NULL)
    				continue;
    		}
    
    		st = NULL;
    		next = head;
    
    		PF_STATE_ENTER_WRITE();
    		while (st != tail) {
    			st = next;
    			next = TAILQ_NEXT(st, entry_list);
    
    			if (creatorid != st->creatorid)
    				continue;
    			if (kif != NULL && kif != st->kif)
    				continue;
    
    			mtx_enter(&st->mtx);
    			SET(st->state_flags, PFSTATE_NOSYNC);
    			mtx_leave(&st->mtx);
    			pf_remove_state(st);
    		}
    		PF_STATE_EXIT_WRITE();
    	}
    	PF_UNLOCK();
    
    	rw_exit_read(&pf_state_list.pfs_rwl);
    }
    
    static void
    pfsync_in_ins(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_state *sp;
    	sa_family_t af1, af2;
    	unsigned int i;
    
    	PF_LOCK();
    	for (i = 0; i < count; i++) {
    		sp = (struct pfsync_state *)(buf + mlen * i);
    		af1 = sp->key[0].af;
    		af2 = sp->key[1].af;
    
    		/* check for invalid values */
    		if (sp->timeout >= PFTM_MAX ||
    		    sp->src.state > PF_TCPS_PROXY_DST ||
    		    sp->dst.state > PF_TCPS_PROXY_DST ||
    		    sp->direction > PF_OUT ||
    		    (((af1 || af2) &&
    		     ((af1 != AF_INET && af1 != AF_INET6) ||
    		      (af2 != AF_INET && af2 != AF_INET6))) ||
    		     (sp->af != AF_INET && sp->af != AF_INET6))) {
    			pfsyncstat_inc(pfsyncs_badval);
    			continue;
    		}
    
    		if (pf_state_import(sp, PFSYNC_SI_PFSYNC) == ENOMEM) {
    			/* drop out, but process the rest of the actions */
    			break;
    		}
    	}
    	PF_UNLOCK();
    }
    
    static void
    pfsync_in_iack(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_ins_ack *ia;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	unsigned int i;
    
    	for (i = 0; i < count; i++) {
    		ia = (struct pfsync_ins_ack *)(buf + mlen * i);
    
    		id_key.id = ia->id;
    		id_key.creatorid = ia->creatorid;
    
    		PF_STATE_ENTER_READ();
    		st = pf_find_state_byid(&id_key);
    		pf_state_ref(st);
    		PF_STATE_EXIT_READ();
    		if (st == NULL)
    			continue;
    
    		if (READ_ONCE(st->sync_defer) != NULL)
    			pfsync_deferred(sc, st);
    
    		pf_state_unref(st);
    	}
    }
    
    static int
    pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src,
        const struct pfsync_state_peer *dst)
    {
    	int sync = 0;
    
    	/*
    	 * The state should never go backwards except
    	 * for syn-proxy states.  Neither should the
    	 * sequence window slide backwards.
    	 */
    	if ((st->src.state > src->state &&
    	    (st->src.state < PF_TCPS_PROXY_SRC ||
    	     src->state >= PF_TCPS_PROXY_SRC)) ||
    
    	    (st->src.state == src->state &&
    	     SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
    		sync++;
    	else
    		pf_state_peer_ntoh(src, &st->src);
    
    	if ((st->dst.state > dst->state) ||
    
    	    (st->dst.state == dst->state &&
    	     SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
    		sync++;
    	else
    		pf_state_peer_ntoh(dst, &st->dst);
    
    	return (sync);
    }
    
    static void
    pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st,
        const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst,
        uint8_t timeout)
    {
    	struct pf_state_scrub *sscrub = NULL;
    	struct pf_state_scrub *dscrub = NULL;
    	int sync;
    
    	if (src->scrub.scrub_flag && st->src.scrub == NULL) {
    		sscrub = pf_state_scrub_get();
    		if (sscrub == NULL) {
    			/* inc error? */
    			goto out;
    		}
    	}
    	if (dst->scrub.scrub_flag && st->dst.scrub == NULL) {
    		dscrub = pf_state_scrub_get();
    		if (dscrub == NULL) {
    			/* inc error? */
    			goto out;
    		}
    	}
    
    	if (READ_ONCE(st->sync_defer) != NULL)
    		pfsync_deferred(sc, st);
    
    	mtx_enter(&st->mtx);
    
    	/* attach the scrub memory if needed */
    	if (sscrub != NULL && st->src.scrub == NULL) {
    		st->src.scrub = sscrub;
    		sscrub = NULL;
    	}
    	if (dscrub != NULL && st->dst.scrub == NULL) {
    		st->dst.scrub = dscrub;
    		dscrub = NULL;
    	}
    
    	if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
    		sync = pfsync_upd_tcp(st, src, dst);
    	else {
    		sync = 0;
    
    		/*
    		 * Non-TCP protocol state machine always go
    		 * forwards
    		 */
    		if (st->src.state > src->state)
    			sync++;
    		else
    			pf_state_peer_ntoh(src, &st->src);
    
    		if (st->dst.state > dst->state)
    			sync++;
    		else
    			pf_state_peer_ntoh(dst, &st->dst);
    	}
    
    	st->pfsync_time = getuptime();
    	if (sync < 2) {
    		st->expire = st->pfsync_time;
    		st->timeout = timeout;
    	}
    
    	mtx_leave(&st->mtx);
    
    	if (sync) {
    		pfsyncstat_inc(pfsyncs_stale);
    		pfsync_update_state(st);
    	}
    
    out:
    	if (sscrub != NULL)
    		pf_state_scrub_put(sscrub);
    	if (dscrub != NULL)
    		pf_state_scrub_put(dscrub);
    }
    
    
    static void
    pfsync_in_upd(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_state *sp;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	int error;
    	unsigned int i;
    
    	for (i = 0; i < count; i++) {
    		sp = (struct pfsync_state *)(buf + mlen * i);
    
    		/* check for invalid values */
    		if (sp->timeout >= PFTM_MAX ||
    		    sp->src.state > PF_TCPS_PROXY_DST ||
    		    sp->dst.state > PF_TCPS_PROXY_DST) {
    			pfsyncstat_inc(pfsyncs_badval);
    			continue;
    		}
    
    		id_key.id = sp->id;
    		id_key.creatorid = sp->creatorid;
    
    		PF_STATE_ENTER_READ();
    		st = pf_find_state_byid(&id_key);
    		pf_state_ref(st);
    		PF_STATE_EXIT_READ();
    		if (st == NULL) {
    			/* insert the update */
    			PF_LOCK();
    			error = pf_state_import(sp, PFSYNC_SI_PFSYNC);
    			if (error)
    				pfsyncstat_inc(pfsyncs_badstate);
    			PF_UNLOCK();
    			continue;
    		}
    
    		pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout);
    
    		pf_state_unref(st);
    	}
    }
    
    static struct mbuf *
    pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count)
    {
    	struct mbuf *m;
    	unsigned int mlen;
    
    	m = m_gethdr(M_DONTWAIT, MT_DATA);
    	if (m == NULL) {
    		pfsyncstat_inc(pfsyncs_onomem);
    		return (NULL);
    	}
    
    	mlen = max_linkhdr + sizeof(sc->sc_template) +
    	    sizeof(struct pfsync_header) +
    	    sizeof(struct pfsync_subheader) +
    	    sizeof(struct pfsync_upd_req) * count;
    
    	if (mlen > MHLEN) {
    		MCLGETL(m, M_DONTWAIT, mlen);
    		if (!ISSET(m->m_flags, M_EXT)) {
    			m_freem(m);
    			return (NULL);
    		}
    	}
    
    	m_align(m, 0);
    	m->m_len = 0;
    
    	return (m);
    }
    
    static void
    pfsync_in_upd_c(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_upd_c *up;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	unsigned int i;
    	struct mbuf *m = NULL;
    	unsigned int rcount = 0;
    
    	for (i = 0; i < count; i++) {
    		up = (struct pfsync_upd_c *)(buf + mlen * i);
    
    		/* check for invalid values */
    		if (up->timeout >= PFTM_MAX ||
    		    up->src.state > PF_TCPS_PROXY_DST ||
    		    up->dst.state > PF_TCPS_PROXY_DST) {
    			pfsyncstat_inc(pfsyncs_badval);
    			continue;
    		}
    
    		id_key.id = up->id;
    		id_key.creatorid = up->creatorid;
    
    		PF_STATE_ENTER_READ();
    		st = pf_find_state_byid(&id_key);
    		pf_state_ref(st);
    		PF_STATE_EXIT_READ();
    		if (st == NULL) {
    			/* We don't have this state. Ask for it. */
    			struct pfsync_upd_req *ur;
    
    			if (m == NULL) {
    				m = pfsync_upd_req_init(sc, count);
    				if (m == NULL) {
    					pfsyncstat_inc(pfsyncs_onomem);
    					continue;
    				}
    			}
    
    			m = m_prepend(m, sizeof(*ur), M_DONTWAIT);
    			if (m == NULL) {
    				pfsyncstat_inc(pfsyncs_onomem);
    				continue;
    			}
    
    			ur = mtod(m, struct pfsync_upd_req *);
    			ur->id = up->id;
    			ur->creatorid = up->creatorid;
    			rcount++;
    
    			continue;
    		}
    
    		pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout);
    
    		pf_state_unref(st);
    	}
    
    	if (m != NULL) {
    		struct pfsync_subheader *subh;
    
    		m = m_prepend(m, sizeof(*subh), M_DONTWAIT);
    		if (m == NULL) {
    			pfsyncstat_inc(pfsyncs_onomem);
    			return;
    		}
    
    		subh = mtod(m, struct pfsync_subheader *);
    		subh->action = PFSYNC_ACT_UPD_REQ;
    		subh->len = sizeof(struct pfsync_upd_req) >> 2;
    		subh->count = htons(rcount);
    
    		m = pfsync_encap(sc, m);
    		if (m == NULL) {
    			pfsyncstat_inc(pfsyncs_onomem);
    			return;
    		}
    
    		pfsync_sendout(sc, m);
    	}
    }
    
    static void
    pfsync_in_ureq(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_upd_req *ur;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	unsigned int i;
    
    	for (i = 0; i < count; i++) {
    		ur = (struct pfsync_upd_req *)(buf + mlen * i);
    
    		id_key.id = ur->id;
    		id_key.creatorid = ur->creatorid;
    
    		if (id_key.id == 0 && id_key.creatorid == 0) {
    			pfsync_bulk_snd_start(sc);
    			continue;
    		}
    
    		PF_STATE_ENTER_READ();
    		st = pf_find_state_byid(&id_key);
    		if (st != NULL && st->timeout < PFTM_MAX &&
    		    !ISSET(st->state_flags, PFSTATE_NOSYNC))
    			pf_state_ref(st);
    		else
    			st = NULL;
    		PF_STATE_EXIT_READ();
    		if (st == NULL) {
    			pfsyncstat_inc(pfsyncs_badstate);
    			continue;
    		}
    
    		pfsync_update_state_req(sc, st);
    
    		pf_state_unref(st);
    	}
    }
    
    static void
    pfsync_in_del(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_state *sp;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	unsigned int i;
    
    	PF_LOCK();
    	PF_STATE_ENTER_WRITE();
    	for (i = 0; i < count; i++) {
    		sp = (struct pfsync_state *)(buf + mlen * i);
    
    		id_key.id = sp->id;
    		id_key.creatorid = sp->creatorid;
    
    		st = pf_find_state_byid(&id_key);
    		if (st == NULL) {
    			pfsyncstat_inc(pfsyncs_badstate);
    			continue;
    		}
    
    		mtx_enter(&st->mtx);
    		SET(st->state_flags, PFSTATE_NOSYNC);
    		mtx_leave(&st->mtx);
    		pf_remove_state(st);
    	}
    	PF_STATE_EXIT_WRITE();
    	PF_UNLOCK();
    }
    
    static void
    pfsync_in_del_c(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int mlen, unsigned int count)
    {
    	const struct pfsync_del_c *sp;
    	struct pf_state_cmp id_key;
    	struct pf_state *st;
    	unsigned int i;
    
    	PF_LOCK();
    	PF_STATE_ENTER_WRITE();
    	for (i = 0; i < count; i++) {
    		sp = (struct pfsync_del_c *)(buf + mlen * i);
    
    		id_key.id = sp->id;
    		id_key.creatorid = sp->creatorid;
    
    		st = pf_find_state_byid(&id_key);
    		if (st == NULL) {
    			pfsyncstat_inc(pfsyncs_badstate);
    			continue;
    		}
    
    		mtx_enter(&st->mtx);
    		SET(st->state_flags, PFSTATE_NOSYNC);
    		mtx_leave(&st->mtx);
    		pf_remove_state(st);
    	}
    	PF_STATE_EXIT_WRITE();
    	PF_UNLOCK();
    }
    
    static void
    pfsync_in_bus(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int len, unsigned int count)
    {
    	const struct pfsync_bus *bus = (struct pfsync_bus *)buf;
    
    	switch (bus->status) {
    	case PFSYNC_BUS_START:
    		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START);
    		break;
    
    	case PFSYNC_BUS_END:
    		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END);
    		break;
    	}
    }
    
    #if defined(IPSEC)
    /* Update an in-kernel tdb. Silently fail if no tdb is found. */
    static void
    pfsync_update_net_tdb(const struct pfsync_tdb *pt)
    {
    	struct tdb *tdb;
    
    	NET_ASSERT_LOCKED();
    
    	/* check for invalid values */
    	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
    	    (pt->dst.sa.sa_family != AF_INET &&
    	     pt->dst.sa.sa_family != AF_INET6))
    		goto bad;
    
    	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
    	    (union sockaddr_union *)&pt->dst, pt->sproto);
    	if (tdb) {
    		uint64_t rpl = betoh64(pt->rpl);
    		uint64_t cur_bytes = betoh64(pt->cur_bytes);
    
    		/* Neither replay nor byte counter should ever decrease. */
    		mtx_enter(&tdb->tdb_mtx);
    		if (rpl >= tdb->tdb_rpl &&
    		    cur_bytes >= tdb->tdb_cur_bytes) {
    			tdb->tdb_rpl = rpl;
    			tdb->tdb_cur_bytes = cur_bytes;
    		}
    		mtx_leave(&tdb->tdb_mtx);
    
    		tdb_unref(tdb);
    	}
    	return;
    
     bad:
    	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
    	    "invalid value");
    	pfsyncstat_inc(pfsyncs_badstate);
    	return;
    }
    #endif
    
    static void
    pfsync_in_tdb(struct pfsync_softc *sc,
        const caddr_t buf, unsigned int len, unsigned int count)
    {
    #if defined(IPSEC)
    	const struct pfsync_tdb *tp;
    	unsigned int i;
    
    	for (i = 0; i < count; i++) {
    		tp = (const struct pfsync_tdb *)(buf + len * i);
    		pfsync_update_net_tdb(tp);
    	}
    #endif
    }
    
    int
    pfsync_input4(struct mbuf **mp, int *offp, int proto, int af)
    {
    	struct mbuf *m = *mp;
    	struct ip *ip;
    
    	ip = mtod(m, struct ip *);
    
    	m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2);
    
    	m_freem(m);
    	*mp = NULL;
    
    	return (IPPROTO_DONE);
    }
    
    int
    pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
    {
    	struct pfsyncstats pfsyncstat;
    
    	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
    	memset(&pfsyncstat, 0, sizeof pfsyncstat);
    	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
    	    pfsyncs_ncounters, NULL);
    	return (sysctl_rdstruct(oldp, oldlenp, newp,
    	    &pfsyncstat, sizeof(pfsyncstat)));
    }
    
    int
    pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
        void *newp, size_t newlen)
    {
    	/* All sysctl names at this level are terminal. */
    	if (namelen != 1)
    		return (ENOTDIR);
    
    	switch (name[0]) {
    	case PFSYNCCTL_STATS:
    		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
    	default:
    		return (ENOPROTOOPT);
    	}
    }