Edit

IABSD.fr/src/sys/dev/dt/dt_dev.c

Branch :

  • Show log

    Commit

  • Author : dlg
    Date : 2025-05-21 00:13:44
    Hash : 267a06d5
    Message : establish the dt_deferred_wakeup() softintr as MPSAFE. without this, profiling on a busy system with btrace ends up with all the cpus spinning on the kernel lock all the time when the dt event buffer gets full. when the event buffer is full, dt tries to wake up btrace to read the buffer, but it defers this wakeup to a softintr to avoid calling wakeup() at really high IPLs. because dt hides itself from stack traces, it looks ambiguous where this contention comes from and it's often assumed that it is the timeout subsystem that's causing the problems. we have improved timeouts, but apart from optimising timeout_barrier out of the picture the flame graphs looked basically the same as before. this is possible now that softintrs are implemented consistently across our archs (thanks visa@). tested by and ok bluhm@

  • sys/dev/dt/dt_dev.c
  • /*	$OpenBSD: dt_dev.c,v 1.43 2025/05/21 00:13:44 dlg Exp $ */
    
    /*
     * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org>
     *
     * Permission to use, copy, modify, and distribute this software for any
     * purpose with or without fee is hereby granted, provided that the above
     * copyright notice and this permission notice appear in all copies.
     *
     * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     */
    
    #include <sys/types.h>
    #include <sys/systm.h>
    #include <sys/param.h>
    #include <sys/clockintr.h>
    #include <sys/device.h>
    #include <sys/exec_elf.h>
    #include <sys/malloc.h>
    #include <sys/proc.h>
    #include <sys/ptrace.h>
    
    #include <machine/intr.h>
    
    #include <dev/dt/dtvar.h>
    
    /*
     * Number of frames to skip in stack traces.
     *
     * The number of frames required to execute dt(4) profiling code
     * depends on the probe, context, architecture and possibly the
     * compiler.
     *
     * Static probes (tracepoints) are executed in the context of the
     * current thread and only need to skip frames up to the recording
     * function.  For example the syscall provider:
     *
     *	dt_prov_syscall_entry+0x141
     *	syscall+0x205		<--- start here
     *	Xsyscall+0x128
     *
     * Probes executed in their own context, like the profile provider,
     * need to skip the frames of that context which are different for
     * every architecture.  For example the profile provider executed
     * from hardclock(9) on amd64:
     *
     *	dt_prov_profile_enter+0x6e
     *	hardclock+0x1a9
     *	lapic_clockintr+0x3f
     *	Xresume_lapic_ltimer+0x26
     *	acpicpu_idle+0x1d2	<---- start here.
     *	sched_idle+0x225
     *	proc_trampoline+0x1c
     */
    /*
     * DT_FA_PROFILE is the frame count passed to stacktrace_save_at() for
     * profiling events, DT_FA_STATIC the one used for all other events.
     */
    #if defined(__amd64__)
    #define DT_FA_PROFILE	5
    #define DT_FA_STATIC	2
    #elif defined(__i386__)
    #define DT_FA_PROFILE	5
    #define DT_FA_STATIC	2
    #elif defined(__macppc__)
    #define DT_FA_PROFILE  5
    #define DT_FA_STATIC   2
    #elif defined(__octeon__)
    #define DT_FA_PROFILE	6
    #define DT_FA_STATIC	2
    #elif defined(__powerpc64__)
    #define DT_FA_PROFILE	6
    #define DT_FA_STATIC	2
    #elif defined(__sparc64__)
    #define DT_FA_PROFILE	7
    #define DT_FA_STATIC	1
    #else
    /* Architectures not listed above skip no frames at all. */
    #define DT_FA_STATIC	0
    #define DT_FA_PROFILE	0
    #endif
    
    #define DT_EVTRING_SIZE	16	/* # of slots in per PCB event ring */
    
    #define DPRINTF(x...) /* nothing */
    
    /*
     * Per-CPU Event States
     *
     *  Locks used to protect struct members:
     *	r	owned by thread doing read(2)
     *	c	owned by CPU
     *	s	sliced ownership, based on read/write indexes
     *	p	written by CPU, read by thread doing read(2)
     */
    struct dt_cpubuf {
    	/* Next slot dt_ring_copy() copies out; only advanced there. */
    	unsigned int		 dc_prod;	/* [r] read index */
    	/* Next slot dt_pcb_ring_get() fills; advanced on consume. */
    	unsigned int		 dc_cons;	/* [c] write index */
    	/* Array of DT_EVTRING_SIZE events allocated in dtalloc(). */
    	struct dt_evt		*dc_ring;	/* [s] ring of event states */
    	/* Set while a slot is handed out, cleared by ring_consume(). */
    	unsigned int	 	 dc_inevt;	/* [c] in event already? */
    
    	/* Counters */
    	unsigned int		 dc_dropevt;	/* [p] # of events dropped */
    	unsigned int		 dc_skiptick;	/* [p] # of ticks skipped */
    	unsigned int		 dc_recurevt;	/* [p] # of recursive events */
    	unsigned int		 dc_readevt;	/* [r] # of events read */
    };
    
    /*
     * Descriptor associated with each program opening /dev/dt.  It is used
     * to keep track of enabled PCBs.
     *
     *  Locks used to protect struct members in this file:
     *	a	atomic
     *	K	kernel lock
     *	r	owned by thread doing read(2)
     *	I	invariant after initialization
     */
    struct dt_softc {
    	SLIST_ENTRY(dt_softc)	 ds_next;	/* [K] descriptor list */
    	int			 ds_unit;	/* [I] D_CLONE unique unit */
    	pid_t			 ds_pid;	/* [I] PID of tracing program */
    	/* Soft interrupt handle running dt_deferred_wakeup(). */
    	void			*ds_si;		/* [I] to defer wakeup(9) */
    
    	struct dt_pcb_list	 ds_pcbs;	/* [K] list of enabled PCBs */
    	int			 ds_recording;	/* [K] currently recording? */
    	/* Incremented per recorded event, decremented by dtread(). */
    	unsigned int		 ds_evtcnt;	/* [a] # of readable evts */
    
    	/* Only the first ncpusfound entries get a ring in dtalloc(). */
    	struct dt_cpubuf	 ds_cpu[MAXCPUS]; /* [I] Per-cpu event states */
    	unsigned int		 ds_lastcpu;	/* [r] last CPU ring read(2). */
    };
    
    SLIST_HEAD(, dt_softc) dtdev_list;	/* [K] list of open /dev/dt nodes */
    
    /*
     * Probes are created during dt_attach() and never modified/freed during
     * the lifetime of the system.  That's why we consider them as [I]mmutable.
     */
    unsigned int			dt_nprobes;	/* [I] # of probes available */
    SIMPLEQ_HEAD(, dt_probe)	dt_probe_list;	/* [I] list of probes */
    
    struct rwlock			dt_lock = RWLOCK_INITIALIZER("dtlk");
    volatile uint32_t		dt_tracing = 0;	/* [K] # of processes tracing */
    
    int allowdt;					/* [a] */
    
    void	dtattach(struct device *, struct device *, void *);
    int	dtopen(dev_t, int, int, struct proc *);
    int	dtclose(dev_t, int, int, struct proc *);
    int	dtread(dev_t, struct uio *, int);
    int	dtioctl(dev_t, u_long, caddr_t, int, struct proc *);
    
    struct	dt_softc *dtlookup(int);
    struct	dt_softc *dtalloc(void);
    void	dtfree(struct dt_softc *);
    
    int	dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *);
    int	dt_ioctl_get_args(struct dt_softc *, struct dtioc_arg *);
    int	dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *);
    int	dt_ioctl_record_start(struct dt_softc *);
    void	dt_ioctl_record_stop(struct dt_softc *);
    int	dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *);
    int	dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *);
    int	dt_ioctl_get_auxbase(struct dt_softc *, struct dtioc_getaux *);
    
    int	dt_ring_copy(struct dt_cpubuf *, struct uio *, size_t, size_t *);
    
    void	dt_wakeup(struct dt_softc *);
    void	dt_deferred_wakeup(void *);
    
    /*
     * Attach routine: reset the list of open descriptors and the probe
     * list, then run every provider's init function and add the value each
     * one returns to dt_nprobes.
     */
    void
    dtattach(struct device *parent, struct device *self, void *aux)
    {
    	SLIST_INIT(&dtdev_list);
    	SIMPLEQ_INIT(&dt_probe_list);
    
    	/* Init providers */
    	dt_nprobes += dt_prov_profile_init();
    	dt_nprobes += dt_prov_syscall_init();
    	dt_nprobes += dt_prov_static_init();
    #ifdef DDBPROF
    	/* The kprobe provider is only initialized on DDBPROF kernels. */
    	dt_nprobes += dt_prov_kprobe_init();
    #endif
    }
    
    /*
     * open(2) handler: create the descriptor for this minor number.
     *
     * Fails with EPERM while `allowdt' is zero, ENOMEM if the descriptor
     * cannot be allocated and EBUSY if the unit is already open.
     */
    int
    dtopen(dev_t dev, int flags, int mode, struct proc *p)
    {
    	struct dt_softc *nsc;
    	int unit = minor(dev);
    
    	if (atomic_load_int(&allowdt) == 0)
    		return EPERM;
    
    	if ((nsc = dtalloc()) == NULL)
    		return ENOMEM;
    
    	/*
    	 * dtalloc() allocates with M_WAITOK, so the duplicate check is
    	 * done afterwards; nothing below this point sleeps.
    	 */
    	if (dtlookup(unit) != NULL) {
    		dtfree(nsc);
    		return EBUSY;
    	}
    
    	nsc->ds_unit = unit;
    	nsc->ds_pid = p->p_p->ps_pid;
    	nsc->ds_evtcnt = 0;
    	nsc->ds_lastcpu = 0;
    	TAILQ_INIT(&nsc->ds_pcbs);
    
    	SLIST_INSERT_HEAD(&dtdev_list, nsc, ds_next);
    
    	DPRINTF("dt%d: pid %d open\n", nsc->ds_unit, nsc->ds_pid);
    
    	return 0;
    }
    
    /*
     * close(2) handler: take the descriptor for this minor off the list of
     * open nodes, stop a recording still in progress, then release its PCBs
     * and the descriptor itself.  Always returns 0.
     */
    int
    dtclose(dev_t dev, int flags, int mode, struct proc *p)
    {
    	struct dt_softc *sc = dtlookup(minor(dev));
    
    	KASSERT(sc != NULL);
    
    	DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid);
    
    	SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next);
    
    	/* The PCBs are only freed once recording has been stopped. */
    	dt_ioctl_record_stop(sc);
    	dt_pcb_purge(&sc->ds_pcbs);
    
    	dtfree(sc);
    
    	return 0;
    }
    
    /*
     * read(2) handler: sleep until at least one event is pending, then copy
     * whole events out of the per-CPU rings, beginning with the ring index
     * remembered in ds_lastcpu.
     *
     * Returns EMSGSIZE if the buffer cannot hold a single struct dt_evt,
     * the error left by an interrupted sleep, or otherwise the result of
     * the last dt_ring_copy() call.
     */
    int
    dtread(dev_t dev, struct uio *uio, int flags)
    {
    	struct dt_softc *sc;
    	struct dt_cpubuf *dc;
    	int i, error = 0, unit = minor(dev);
    	size_t count, max, read = 0;
    
    	sc = dtlookup(unit);
    	KASSERT(sc != NULL);
    
    	/*
    	 * Only whole events are handed out, so round the budget down.
    	 * Rounding up would let dt_ring_copy() consume an event of which
    	 * only a truncated part fits into the caller's buffer.
    	 */
    	max = uio->uio_resid / sizeof(struct dt_evt);
    	if (max < 1)
    		return (EMSGSIZE);
    
    	/* Wait for dt_deferred_wakeup() to signal pending events. */
    	while (!atomic_load_int(&sc->ds_evtcnt)) {
    		sleep_setup(sc, PWAIT | PCATCH, "dtread");
    		error = sleep_finish(0, !atomic_load_int(&sc->ds_evtcnt));
    		if (error == EINTR || error == ERESTART)
    			break;
    	}
    	if (error)
    		return error;
    
    	KERNEL_ASSERT_LOCKED();
    	for (i = 0; i < ncpusfound; i++) {
    		count = 0;
    		dc = &sc->ds_cpu[(sc->ds_lastcpu + i) % ncpusfound];
    		error = dt_ring_copy(dc, uio, max, &count);
    		/* Give up only if this ring failed without copying. */
    		if (error && count == 0)
    			break;
    
    		read += count;
    		max -= count;
    		if (max == 0)
    			break;
    	}
    	/*
    	 * `%' binds tighter than `+=': advance by i, or by nothing after
    	 * a full pass over all rings (i == ncpusfound).
    	 */
    	sc->ds_lastcpu += i % ncpusfound;
    
    	atomic_sub_int(&sc->ds_evtcnt, read);
    
    	return error;
    }
    
    /*
     * ioctl(2) handler.  The query commands (probe list, probe arguments,
     * statistics) are served to any caller; the remaining known commands
     * require suser() to succeed.  Unknown commands yield ENOTTY before
     * any privilege check is made.
     */
    int
    dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
    {
    	struct dt_softc *sc = dtlookup(minor(dev));
    	int error;
    
    	KASSERT(sc != NULL);
    
    	/* Unprivileged commands, and rejection of unknown ones. */
    	switch (cmd) {
    	case DTIOCGPLIST:
    		return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr);
    	case DTIOCGARGS:
    		return dt_ioctl_get_args(sc, (struct dtioc_arg *)addr);
    	case DTIOCGSTATS:
    		return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr);
    	case DTIOCRECORD:
    	case DTIOCPRBENABLE:
    	case DTIOCPRBDISABLE:
    	case DTIOCGETAUXBASE:
    		/* root only ioctl(2) */
    		break;
    	default:
    		return ENOTTY;
    	}
    
    	error = suser(p);
    	if (error != 0)
    		return error;
    
    	/* Privileged commands. */
    	switch (cmd) {
    	case DTIOCRECORD:
    		/* A non-zero argument starts recording, zero stops it. */
    		if (*(int *)addr)
    			return dt_ioctl_record_start(sc);
    		dt_ioctl_record_stop(sc);
    		return 0;
    	case DTIOCPRBENABLE:
    		return dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr);
    	case DTIOCPRBDISABLE:
    		return dt_ioctl_probe_disable(sc, (struct dtioc_req *)addr);
    	case DTIOCGETAUXBASE:
    		return dt_ioctl_get_auxbase(sc, (struct dtioc_getaux *)addr);
    	default:
    		/* Filtered out by the first switch. */
    		KASSERT(0);
    	}
    
    	return 0;
    }
    
    /*
     * Find the open descriptor whose unit number is `unit'.  Returns NULL
     * if there is none.  The kernel lock must be held.
     */
    struct dt_softc *
    dtlookup(int unit)
    {
    	struct dt_softc *cur;
    
    	KERNEL_ASSERT_LOCKED();
    
    	SLIST_FOREACH(cur, &dtdev_list, ds_next) {
    		if (cur->ds_unit == unit)
    			return cur;
    	}
    
    	return NULL;
    }
    
    /*
     * Allocate a zeroed descriptor, one event ring per CPU found and the
     * soft interrupt used to defer wakeups.  The allocations use M_WAITOK
     * and M_CANFAIL.  Returns NULL, with everything released again, if any
     * step fails.
     */
    struct dt_softc *
    dtalloc(void)
    {
    	struct dt_softc *sc;
    	struct dt_evt *ring;
    	int cpu;
    
    	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
    	if (sc == NULL)
    		return NULL;
    
    	for (cpu = 0; cpu < ncpusfound; cpu++) {
    		ring = mallocarray(DT_EVTRING_SIZE, sizeof(*ring), M_DEVBUF,
    		    M_WAITOK|M_CANFAIL|M_ZERO);
    		if (ring == NULL)
    			goto fail;
    		sc->ds_cpu[cpu].dc_ring = ring;
    	}
    
    	sc->ds_si = softintr_establish(IPL_SOFTCLOCK | IPL_MPSAFE,
    	    dt_deferred_wakeup, sc);
    	if (sc->ds_si == NULL)
    		goto fail;
    
    	return sc;
    
    fail:
    	/* Rings never reached are still NULL thanks to M_ZERO. */
    	dtfree(sc);
    	return NULL;
    }
    
    /*
     * Release a descriptor: disestablish its soft interrupt if one was
     * set up, then free every per-CPU event ring and the descriptor.
     */
    void
    dtfree(struct dt_softc *sc)
    {
    	int cpu;
    
    	if (sc->ds_si != NULL)
    		softintr_disestablish(sc->ds_si);
    
    	for (cpu = 0; cpu < ncpusfound; cpu++) {
    		free(sc->ds_cpu[cpu].dc_ring, M_DEVBUF,
    		    DT_EVTRING_SIZE * sizeof(struct dt_evt));
    	}
    
    	free(sc, M_DEVBUF, sizeof(*sc));
    }
    
    /*
     * DTIOCGPLIST: copy a description of every registered probe out to the
     * buffer at dtpr_probes.
     *
     * dtpr_size is always overwritten with the size needed for the full
     * list; a caller passing a size of 0 gets only that.  Returns ENOSPC
     * when the buffer runs out before the list does, or a copyout() error.
     */
    int
    dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr)
    {
    	struct dtioc_probe_info info, *out;
    	struct dt_probe *dtp;
    	size_t avail;
    	int error;
    
    	avail = dtpr->dtpr_size;
    	dtpr->dtpr_size = dt_nprobes * sizeof(info);
    	if (avail == 0)
    		return 0;
    
    	out = dtpr->dtpr_probes;
    	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
    		if (avail < sizeof(info))
    			return ENOSPC;
    
    		/* Start from zero so no stale stack bytes are copied out. */
    		memset(&info, 0, sizeof(info));
    		info.dtpi_pbn = dtp->dtp_pbn;
    		info.dtpi_nargs = dtp->dtp_nargs;
    		strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name,
    		    sizeof(info.dtpi_prov));
    		strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func));
    		strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name));
    
    		error = copyout(&info, out, sizeof(info));
    		if (error != 0)
    			return error;
    
    		avail -= sizeof(info);
    		out++;
    	}
    
    	return 0;
    }
    
    /*
     * DTIOCGARGS: copy the argument types of probe dtar_pbn out to the
     * buffer at dtar_args.
     *
     * Returns EINVAL for an unknown probe number, ENOSPC when the buffer
     * is too small, or a copyout() error.  Probes with a non-zero
     * dtp_sysnum report a size of 0.
     */
    int
    dt_ioctl_get_args(struct dt_softc *sc, struct dtioc_arg *dtar)
    {
    	struct dtioc_arg_info info, *dtai;
    	struct dt_probe *dtp;
    	size_t size, n, t;
    	uint32_t pbn;
    	int error = 0;
    
    	pbn = dtar->dtar_pbn;
    	if (pbn == 0 || pbn > dt_nprobes)
    		return EINVAL;
    
    	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
    		if (pbn == dtp->dtp_pbn)
    			break;
    	}
    	if (dtp == NULL)
    		return EINVAL;
    
    	if (dtp->dtp_sysnum != 0) {
    		/* currently not supported for system calls */
    		dtar->dtar_size = 0;
    		return 0;
    	}
    
    	/*
    	 * Report the size up front so a size-only query (size == 0) works.
    	 * NOTE(review): the size is computed with sizeof(*dtar) although
    	 * the buffer holds struct dtioc_arg_info elements -- confirm
    	 * against the userland consumer before changing.
    	 */
    	size = dtar->dtar_size;
    	dtar->dtar_size = dtp->dtp_nargs * sizeof(*dtar);
    	if (size == 0)
    		return 0;
    
    	/* `t' counts the entries actually emitted, `n' the arguments. */
    	t = 0;
    	dtai = dtar->dtar_args;
    	for (n = 0; n < dtp->dtp_nargs; n++) {
    		if (size < sizeof(*dtai)) {
    			error = ENOSPC;
    			break;
    		}
    		/* Arguments without a recorded type are skipped. */
    		if (n >= DTMAXARGTYPES || dtp->dtp_argtype[n] == NULL)
    			continue;
    		memset(&info, 0, sizeof(info));
    		info.dtai_pbn = dtp->dtp_pbn;
    		info.dtai_argn = t++;
    		strlcpy(info.dtai_argtype, dtp->dtp_argtype[n],
    		    sizeof(info.dtai_argtype));
    		error = copyout(&info, dtai, sizeof(*dtai));
    		if (error)
    			break;
    		size -= sizeof(*dtai);
    		dtai++;
    	}
    	/* Updated even when the loop stopped on an error. */
    	dtar->dtar_size = t * sizeof(*dtar);
    
    	return error;
    }
    
    /*
     * DTIOCGSTATS: report the event counters of this descriptor.  Every
     * value returned is the sum of the matching per-CPU counter over all
     * CPUs found.  Always returns 0.
     */
    int
    dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst)
    {
    	struct dt_cpubuf *dc;
    	uint64_t readevt, dropevt, skiptick, recurevt;
    	int i;
    
    	readevt = dropevt = skiptick = recurevt = 0;
    	for (i = 0; i < ncpusfound; i++) {
    		dc = &sc->ds_cpu[i];
    
    		/* Pairs with the membar_producer() of the counter writers. */
    		membar_consumer();
    		dropevt += dc->dc_dropevt;
    		skiptick += dc->dc_skiptick;
    		recurevt += dc->dc_recurevt;
    		readevt += dc->dc_readevt;
    	}
    
    	dtst->dtst_readevt = readevt;
    	dtst->dtst_dropevt = dropevt;
    	dtst->dtst_skiptick = skiptick;
    	dtst->dtst_recurevt = recurevt;
    	return 0;
    }
    
    /*
     * Start recording on this descriptor: publish each of its PCBs on the
     * PCB list of the probe it belongs to and arm the clock interrupt of
     * PCBs that have a period (dp_nsecs != 0).
     *
     * Returns EBUSY if already recording and ENOENT if no probe has been
     * enabled on the descriptor.
     */
    int
    dt_ioctl_record_start(struct dt_softc *sc)
    {
    	uint64_t now;
    	struct dt_pcb *dp;
    
    	if (sc->ds_recording)
    		return EBUSY;
    
    	KERNEL_ASSERT_LOCKED();
    	if (TAILQ_EMPTY(&sc->ds_pcbs))
    		return ENOENT;
    
    	rw_enter_write(&dt_lock);
    	/* One timestamp for all PCBs, taken before arming any of them. */
    	now = nsecuptime();
    	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
    		struct dt_probe *dtp = dp->dp_dtp;
    
    		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
    		dtp->dtp_recording++;
    		dtp->dtp_prov->dtpv_recording++;
    
    		if (dp->dp_nsecs != 0) {
    			clockintr_bind(&dp->dp_clockintr, dp->dp_cpu, dt_clock,
    			    dp);
    			clockintr_schedule(&dp->dp_clockintr,
    			    now + dp->dp_nsecs);
    		}
    	}
    	rw_exit_write(&dt_lock);
    
    	sc->ds_recording = 1;
    	dt_tracing++;
    
    	return 0;
    }
    
    /*
     * Stop recording on this descriptor; does nothing if it is not
     * recording.  Undoes dt_ioctl_record_start(): unbinds the clock
     * interrupts, takes the PCBs off their probes' lists and waits for an
     * SMR barrier before returning.
     */
    void
    dt_ioctl_record_stop(struct dt_softc *sc)
    {
    	struct dt_pcb *dp;
    
    	if (!sc->ds_recording)
    		return;
    
    	DPRINTF("dt%d: pid %d disable\n", sc->ds_unit, sc->ds_pid);
    
    	dt_tracing--;
    	sc->ds_recording = 0;
    
    	rw_enter_write(&dt_lock);
    	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
    		struct dt_probe *dtp = dp->dp_dtp;
    
    		/*
    		 * Set an execution barrier to ensure the shared
    		 * reference to dp is inactive.
    		 */
    		if (dp->dp_nsecs != 0)
    			clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);
    
    		dtp->dtp_recording--;
    		dtp->dtp_prov->dtpv_recording--;
    		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
    	}
    	rw_exit_write(&dt_lock);
    
    	/* Wait until readers cannot access the PCBs. */
    	smr_barrier();
    }
    
    int
    dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq)
    {
    	struct dt_pcb_list plist;
    	struct dt_probe *dtp;
    	struct dt_pcb *dp;
    	int error;
    
    	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
    		if (dtp->dtp_pbn == dtrq->dtrq_pbn)
    			break;
    	}
    	if (dtp == NULL)
    		return ENOENT;
    
    	/* Only allow one probe of each type. */
    	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
    		if (dp->dp_dtp->dtp_pbn == dtrq->dtrq_pbn)
    			return EEXIST;
    	}
    
    	TAILQ_INIT(&plist);
    	error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq);
    	if (error)
    		return error;
    
    	DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid,
    	    dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS);
    
    	/* Append all PCBs to this instance */
    	TAILQ_CONCAT(&sc->ds_pcbs, &plist, dp_snext);
    
    	return 0;
    }
    
    /*
     * DTIOCPRBDISABLE: disable probe dtrq_pbn on this descriptor by
     * calling the provider's dtpv_dealloc() callback, if it has one.
     *
     * Returns ENOENT for an unknown probe or the error returned by the
     * provider.
     */
    int
    dt_ioctl_probe_disable(struct dt_softc *sc, struct dtioc_req *dtrq)
    {
    	struct dt_probe *dtp;
    	int error;
    
    	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
    		if (dtp->dtp_pbn == dtrq->dtrq_pbn)
    			break;
    	}
    	if (dtp == NULL)
    		return ENOENT;
    
    	/* Providers without a dealloc callback have nothing to undo. */
    	if (dtp->dtp_prov->dtpv_dealloc) {
    		error = dtp->dtp_prov->dtpv_dealloc(dtp, sc, dtrq);
    		if (error)
    			return error;
    	}
    
    	/* The format now has a conversion for the probe number passed. */
    	DPRINTF("dt%d: pid %d dealloc %u\n", sc->ds_unit, sc->ds_pid,
    	    dtrq->dtrq_pbn);
    
    	return 0;
    }
    
    int
    dt_ioctl_get_auxbase(struct dt_softc *sc, struct dtioc_getaux *dtga)
    {
    	struct uio uio;
    	struct iovec iov;
    	struct process *pr;
    	struct proc *p = curproc;
    	AuxInfo auxv[ELF_AUX_ENTRIES];
    	int i, error;
    
    	dtga->dtga_auxbase = 0;
    
    	if ((pr = prfind(dtga->dtga_pid)) == NULL)
    		return ESRCH;
    
    	iov.iov_base = auxv;
    	iov.iov_len = sizeof(auxv);
    	uio.uio_iov = &iov;
    	uio.uio_iovcnt = 1;
    	uio.uio_offset = pr->ps_auxinfo;
    	uio.uio_resid = sizeof(auxv);
    	uio.uio_segflg = UIO_SYSSPACE;
    	uio.uio_procp = p;
    	uio.uio_rw = UIO_READ;
    
    	error = process_domem(p, pr, &uio, PT_READ_D);
    	if (error)
    		return error;
    
    	for (i = 0; i < ELF_AUX_ENTRIES; i++)
    		if (auxv[i].au_id == AUX_base)
    			dtga->dtga_auxbase = auxv[i].au_v;
    
    	return 0;
    }
    
    /*
     * Allocate a zeroed probe for provider `dtpv' without sleeping
     * (M_NOWAIT).  The `func' and `name' pointers are stored as given, not
     * copied.  Returns NULL if the allocation fails.
     */
    struct dt_probe *
    dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv)
    {
    	struct dt_probe *probe;
    
    	probe = malloc(sizeof(*probe), M_DT, M_NOWAIT|M_ZERO);
    	if (probe != NULL) {
    		SMR_SLIST_INIT(&probe->dtp_pcbs);
    		probe->dtp_prov = dtpv;
    		probe->dtp_func = func;
    		probe->dtp_name = name;
    		probe->dtp_sysnum = -1;
    		probe->dtp_ref = 0;
    	}
    
    	return probe;
    }
    
    /*
     * Give `dtp' the next probe number, counting from 1, and append it to
     * the global probe list.
     */
    void
    dt_dev_register_probe(struct dt_probe *dtp)
    {
    	static uint64_t probe_nb;
    
    	probe_nb++;
    	dtp->dtp_pbn = probe_nb;
    
    	SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next);
    }
    
    /*
     * Allocate a zeroed PCB tying probe `dtp' to descriptor `sc'.  The
     * allocation uses M_WAITOK and M_CANFAIL; NULL is returned if it fails.
     */
    struct dt_pcb *
    dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc)
    {
    	struct dt_pcb *pcb;
    
    	pcb = malloc(sizeof(*pcb), M_DT, M_WAITOK|M_CANFAIL|M_ZERO);
    	if (pcb != NULL) {
    		pcb->dp_dtp = dtp;
    		pcb->dp_sc = sc;
    	}
    
    	return pcb;
    }
    
    /*
     * Release a PCB obtained from dt_pcb_alloc().
     */
    void
    dt_pcb_free(struct dt_pcb *dp)
    {
    	free(dp, M_DT, sizeof(struct dt_pcb));
    }
    
    /*
     * Unlink and free every PCB on `plist', leaving the list empty.
     */
    void
    dt_pcb_purge(struct dt_pcb_list *plist)
    {
    	struct dt_pcb *victim;
    
    	while (!TAILQ_EMPTY(plist)) {
    		victim = TAILQ_FIRST(plist);
    		TAILQ_REMOVE(plist, victim, dp_snext);
    		dt_pcb_free(victim);
    	}
    }
    
    /*
     * Add `skip' to the skipped-ticks counter of the current CPU's buffer
     * in the descriptor owning `dp'.
     */
    void
    dt_pcb_ring_skiptick(struct dt_pcb *dp, unsigned int skip)
    {
    	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];
    
    	dc->dc_skiptick += skip;
    	/* Pairs with the membar_consumer() in dt_ioctl_get_stats(). */
    	membar_producer();
    }
    
    /*
     * Get a reference to the next free event state from the ring.
     *
     * Works on the current CPU's buffer of the descriptor owning `dp'.
     * Returns NULL, after bumping the matching counter, if an event is
     * already being recorded on this CPU (recursion) or if the ring has no
     * free slot.  On success the slot is pre-filled with the probe number,
     * CPU, pid, tid and timestamp plus whatever dp_evtflags asks for, and
     * dc_inevt stays set until dt_pcb_ring_consume() clears it.
     *
     * `profiling' selects how many stack frames are skipped when a kernel
     * stack is saved: DT_FA_PROFILE if non-zero, DT_FA_STATIC otherwise.
     */
    struct dt_evt *
    dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
    {
    	struct proc *p = curproc;
    	struct dt_evt *dtev;
    	int prod, cons, distance;
    	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];
    
    	/* A probe fired while this CPU is still recording another one. */
    	if (dc->dc_inevt == 1) {
    		dc->dc_recurevt++;
    		membar_producer();
    		return NULL;
    	}
    
    	dc->dc_inevt = 1;
    
    	membar_consumer();
    	prod = dc->dc_prod;
    	cons = dc->dc_cons;
    	distance = prod - cons;
    	/*
    	 * Full when advancing `cons' by one slot, modulo the ring size,
    	 * would make it equal to `prod'.
    	 */
    	if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) {
    		/* read(2) isn't finished */
    		dc->dc_dropevt++;
    		membar_producer();
    
    		dc->dc_inevt = 0;
    		return NULL;
    	}
    
    	/*
    	 * Save states in next free event slot.
    	 */
    	dtev = &dc->dc_ring[cons];
    	memset(dtev, 0, sizeof(*dtev));
    
    	dtev->dtev_pbn = dp->dp_dtp->dtp_pbn;
    	dtev->dtev_cpu = cpu_number();
    	dtev->dtev_pid = p->p_p->ps_pid;
    	dtev->dtev_tid = p->p_tid + THREAD_PID_OFFSET;
    	nanotime(&dtev->dtev_tsp);
    
    	if (ISSET(dp->dp_evtflags, DTEVT_EXECNAME))
    		strlcpy(dtev->dtev_comm, p->p_p->ps_comm, sizeof(dtev->dtev_comm));
    
    	if (ISSET(dp->dp_evtflags, DTEVT_KSTACK)) {
    		if (profiling)
    			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
    		else
    			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
    	}
    	if (ISSET(dp->dp_evtflags, DTEVT_USTACK))
    		stacktrace_save_utrace(&dtev->dtev_ustack);
    
    	return dtev;
    }
    
    /*
     * Commit the event slot handed out by dt_pcb_ring_get(): advance the
     * write index of the current CPU's ring, count the event as readable,
     * clear the in-event marker and schedule a wakeup of the reader.
     * `dtev' must be the slot at the current write index.
     */
    void
    dt_pcb_ring_consume(struct dt_pcb *dp, struct dt_evt *dtev)
    {
    	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];
    
    	KASSERT(dtev == &dc->dc_ring[dc->dc_cons]);
    
    	dc->dc_cons = (dc->dc_cons + 1) % DT_EVTRING_SIZE;
    	/* Pairs with the membar_consumer() in dt_ring_copy(). */
    	membar_producer();
    
    	atomic_inc_int(&dp->dp_sc->ds_evtcnt);
    	dc->dc_inevt = 0;
    
    	dt_wakeup(dp->dp_sc);
    }
    
    /*
     * Copy at most `max' events from `dc', producing the same amount
     * of free slots.
     *
     * The number of events copied is stored in `*rcvd', except when the
     * ring is empty or the first uiomove() fails: both return early and
     * leave `*rcvd' and the read index untouched.  Returns 0 or the
     * uiomove() error.
     */
    int
    dt_ring_copy(struct dt_cpubuf *dc, struct uio *uio, size_t max, size_t *rcvd)
    {
    	size_t count, copied = 0;
    	unsigned int cons, prod;
    	int error = 0;
    
    	KASSERT(max > 0);
    
    	membar_consumer();
    	cons = dc->dc_cons;
    	prod = dc->dc_prod;
    
    	/*
    	 * With the write index behind the read index the pending events
    	 * wrap around: first take the run up to the end of the ring.
    	 */
    	if (cons < prod)
    		count = DT_EVTRING_SIZE - prod;
    	else
    		count = cons - prod;
    
    	if (count == 0)
    		return 0;
    
    	count = MIN(count, max);
    	error = uiomove(&dc->dc_ring[prod], count * sizeof(struct dt_evt), uio);
    	if (error)
    		return error;
    	copied += count;
    
    	/* Produce */
    	prod = (prod + count) % DT_EVTRING_SIZE;
    
    	/* If the ring didn't wrap, stop here. */
    	if (max == copied || prod != 0 || cons == 0)
    		goto out;
    
    	/* Second run: slots 0 up to, but excluding, the write index. */
    	count = MIN(cons, (max - copied));
    	error = uiomove(&dc->dc_ring[0], count * sizeof(struct dt_evt), uio);
    	if (error)
    		goto out;
    
    	copied += count;
    	prod += count;
    
    out:
    	/* Publish what was copied, even if the second run failed. */
    	dc->dc_readevt += copied;
    	dc->dc_prod = prod;
    	membar_producer();
    
    	*rcvd = copied;
    	return error;
    }
    
    /*
     * Request a wakeup of the thread sleeping in dtread() by scheduling
     * the descriptor's soft interrupt, which dtalloc() establishes with
     * dt_deferred_wakeup() as its handler.
     */
    void
    dt_wakeup(struct dt_softc *sc)
    {
    	/*
    	 * It is not always safe or possible to call wakeup(9) and grab
    	 * the SCHED_LOCK() from a given tracepoint.  This is true for
    	 * any tracepoint that might trigger inside the scheduler or at
    	 * any IPL higher than IPL_SCHED.  For this reason use a soft-
    	 * interrupt to defer the wakeup.
    	 */
    	softintr_schedule(sc->ds_si);
    }
    
    /*
     * Soft interrupt handler: `arg' is the descriptor, which is also the
     * wait channel dtread() sleeps on.
     */
    void
    dt_deferred_wakeup(void *arg)
    {
    	wakeup(arg);
    }