Branch :
/* $OpenBSD: subr_prof.c,v 1.42 2025/05/24 06:49:16 deraadt Exp $ */
/* $NetBSD: subr_prof.c,v 1.12 1996/04/22 01:38:50 christos Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/pledge.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/user.h>
#include <sys/gmon.h>
uint64_t profclock_period;
#if defined(GPROF) || defined(DDBPROF)
#include <sys/malloc.h>
#include <uvm/uvm_extern.h>
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
/*
* Flag to prevent CPUs from executing the mcount() monitor function
* until we're sure they are in a sane state.
*/
int gmoninit = 0;
u_int gmon_cpu_count; /* [K] number of CPUs with profiling enabled */
extern char etext[];
void gmonclock(struct clockrequest *, void *, void *);
void
prof_init(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct gmonparam *p;
u_long lowpc, highpc, textsize;
u_long kcountsize, fromssize, tossize;
long tolimit;
char *cp;
int size;
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER));
highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
textsize = highpc - lowpc;
#ifdef GPROF
printf("Profiling kernel, textsize=%ld [%lx..%lx]\n",
textsize, lowpc, highpc);
#endif
kcountsize = textsize / HISTFRACTION;
fromssize = textsize / HASHFRACTION;
tolimit = textsize * ARCDENSITY / 100;
if (tolimit < MINARCS)
tolimit = MINARCS;
else if (tolimit > MAXARCS)
tolimit = MAXARCS;
tossize = tolimit * sizeof(struct tostruct);
size = sizeof(*p) + kcountsize + fromssize + tossize;
/* Allocate and initialize one profiling buffer per CPU. */
CPU_INFO_FOREACH(cii, ci) {
cp = km_alloc(round_page(size), &kv_any, &kp_zero, &kd_nowait);
if (cp == NULL) {
printf("No memory for profiling.\n");
return;
}
clockintr_bind(&ci->ci_gmonclock, ci, gmonclock, NULL);
clockintr_stagger(&ci->ci_gmonclock, profclock_period,
CPU_INFO_UNIT(ci), MAXCPUS);
p = (struct gmonparam *)cp;
cp += sizeof(*p);
p->tos = (struct tostruct *)cp;
cp += tossize;
p->kcount = (u_short *)cp;
cp += kcountsize;
p->froms = (u_short *)cp;
p->state = GMON_PROF_OFF;
p->lowpc = lowpc;
p->highpc = highpc;
p->textsize = textsize;
p->hashfraction = HASHFRACTION;
p->kcountsize = kcountsize;
p->fromssize = fromssize;
p->tolimit = tolimit;
p->tossize = tossize;
ci->ci_gmon = p;
}
}
int
prof_state_toggle(struct cpu_info *ci, int oldstate)
{
struct gmonparam *gp = ci->ci_gmon;
int error = 0;
KERNEL_ASSERT_LOCKED();
if (gp->state == oldstate)
return (0);
switch (gp->state) {
case GMON_PROF_ON:
#if !defined(GPROF)
/*
* If this is not a profiling kernel, we need to patch
* all symbols that can be instrumented.
*/
error = db_prof_enable();
#endif
if (error == 0) {
if (++gmon_cpu_count == 1)
startprofclock(&process0);
clockintr_advance(&ci->ci_gmonclock, profclock_period);
}
break;
default:
error = EINVAL;
gp->state = GMON_PROF_OFF;
/* FALLTHROUGH */
case GMON_PROF_OFF:
clockintr_cancel(&ci->ci_gmonclock);
if (--gmon_cpu_count == 0)
stopprofclock(&process0);
#if !defined(GPROF)
db_prof_disable();
#endif
break;
}
return (error);
}
/*
* Return kernel profiling information.
*/
int
sysctl_doprof(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct gmonparam *gp = NULL;
int error, cpuid, op, state;
/* all sysctl names at this level are name and field */
if (namelen != 2)
return (ENOTDIR); /* overloaded */
op = name[0];
cpuid = name[1];
CPU_INFO_FOREACH(cii, ci) {
if (cpuid == CPU_INFO_UNIT(ci)) {
gp = ci->ci_gmon;
break;
}
}
if (gp == NULL)
return (EOPNOTSUPP);
/* Assume that if we're here it is safe to execute profiling. */
gmoninit = 1;
switch (op) {
case GPROF_STATE:
state = gp->state;
error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state);
if (error)
return (error);
return prof_state_toggle(ci, state);
case GPROF_COUNT:
return (sysctl_struct(oldp, oldlenp, newp, newlen,
gp->kcount, gp->kcountsize));
case GPROF_FROMS:
return (sysctl_struct(oldp, oldlenp, newp, newlen,
gp->froms, gp->fromssize));
case GPROF_TOS:
return (sysctl_struct(oldp, oldlenp, newp, newlen,
gp->tos, gp->tossize));
case GPROF_GMONPARAM:
return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
void
gmonclock(struct clockrequest *cr, void *cf, void *arg)
{
uint64_t count;
struct clockframe *frame = cf;
struct gmonparam *g = curcpu()->ci_gmon;
u_long i;
count = clockrequest_advance(cr, profclock_period);
if (count > ULONG_MAX)
count = ULONG_MAX;
/*
* Kernel statistics are just like addupc_intr(), only easier.
*/
if (!CLKF_USERMODE(frame) && g != NULL && g->state == GMON_PROF_ON) {
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i] += (u_long)count;
}
}
}
#endif /* GPROF || DDBPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
int
sys_profil(struct proc *p, void *v, register_t *retval)
{
struct sys_profil_args /* {
syscallarg(void *) buf;
syscallarg(size_t) buflen;
syscallarg(size_t) samplesize;
syscallarg(u_long) offset;
syscallarg(u_int) scale;
syscallarg(int) dirfd;
} */ *uap = v;
struct process *pr = p->p_p;
struct uprof *upp;
int s;
/* Only binaries linked for profiling can do profil() */
if ((pr->ps_flags & PS_PROFILE) == 0)
return (EPERM);
if (SCARG(uap, scale) > (1 << 16))
return (EINVAL);
if (SCARG(uap, scale) == 0) {
stopprofclock(pr);
need_resched(curcpu());
return (0);
}
upp = &pr->ps_prof;
/* May not change to a new buffer */
if (upp->pr_buf &&
(SCARG(uap, buf) != upp->pr_buf ||
SCARG(uap, buflen) != upp->pr_buflen))
return (EALREADY);
/* First call determines where the output files will go */
if (upp->pr_cdir == NULL) {
struct vnode *dirvp;
upp->pr_ucred = crhold(p->p_ucred);
if (SCARG(uap, dirfd) != -1) {
struct filedesc *fdp = p->p_fd;
struct file *fp;
if ((fp = fd_getfile(fdp, SCARG(uap, dirfd))) == NULL)
return (EBADF);
dirvp = fp->f_data;
if (fp->f_type != DTYPE_VNODE || dirvp->v_type != VDIR) {
FRELE(fp, p);
return (ENOTDIR);
}
} else
dirvp = p->p_fd->fd_cdir;
upp->pr_cdir = dirvp;
vref(upp->pr_cdir);
}
#if 0
if (upp->pr_buf == NULL) {
/* XXX In the future, consider forcing the buffer immutable */
struct sys_mimmutable_args ua;
SCARG(&ua, addr) = SCARG(uap, buf);
SCARG(&ua, len) = SCARG(uap, buflen);
(void) sys_mimmutable(p, &ua, retval);
}
#endif
/* Block profile interrupts while changing state. */
s = splstatclock();
upp->pr_off = SCARG(uap, offset);
upp->pr_scale = SCARG(uap, scale);
upp->pr_base = (caddr_t)SCARG(uap, buf) + sizeof(struct gmonhdr);
upp->pr_size = SCARG(uap, samplesize);
upp->pr_buf = SCARG(uap, buf);
upp->pr_buflen = SCARG(uap, buflen);
startprofclock(pr);
splx(s);
need_resched(curcpu());
return (0);
}
void
prof_fork(struct process *pr)
{
struct uprof *upp = &pr->ps_prof;
if (upp->pr_cdir)
vref(upp->pr_cdir);
if (upp->pr_ucred)
crhold(upp->pr_ucred);
}
void
prof_exec(struct process *pr)
{
struct uprof *upp = &pr->ps_prof;
if (upp->pr_cdir)
vrele(upp->pr_cdir);
upp->pr_cdir = NULL;
if (upp->pr_ucred)
crfree(upp->pr_ucred);
upp->pr_ucred = NULL;
upp->pr_buf = NULL;
}
void
prof_write(struct proc *p)
{
struct process *pr = p->p_p;
struct uprof *upp = &pr->ps_prof;
struct filedesc *fdp = p->p_fd;
struct vnode *cdir = NULL, *old_cdir, *vp;
struct ucred *cred = NULL, *old_cred;
struct nameidata nd;
struct gmonhdr hdr;
struct vattr vattr;
int error;
size_t size;
char *name = NULL;
/* Is this process profiling? */
if (upp->pr_buf == NULL)
return;
cdir = upp->pr_cdir;
cred = upp->pr_ucred;
/* Process sets hrd.totarc to indicate space used by the arc tables */
error = copyin(upp->pr_buf, &hdr, sizeof hdr);
if (error)
goto out;
if (hdr.totarc <= 0)
goto out;
size = sizeof(hdr) + upp->pr_size + hdr.totarc;
if (size > upp->pr_buflen)
goto out;
name = pool_get(&namei_pool, PR_WAITOK);
if (snprintf(name, MAXPATHLEN, "gmon.%s.%d.out",
pr->ps_comm, pr->ps_pid) >= MAXPATHLEN)
goto out;
/* Swap back to the original directory */
old_cdir = fdp->fd_cdir;
fdp->fd_cdir = cdir;
vrele(old_cdir);
cdir = NULL; /* will be released as fdp->fd_cdir */
/* Swap back to the original cred */
old_cred = p->p_ucred;
p->p_ucred = crdup(cred);
crfree(old_cred);
NDINIT(&nd, 0, BYPASSUNVEIL | KERNELPATH, UIO_SYSSPACE, name, p);
error = vn_open(&nd, O_CREAT | FWRITE | O_NOFOLLOW | O_NONBLOCK,
S_IRUSR | S_IWUSR);
if (error)
goto out;
/*
* Don't dump to non-regular files, files with links, or files
* owned by someone else.
*/
vp = nd.ni_vp;
if ((error = VOP_GETATTR(vp, &vattr, cred, p)) != 0) {
VOP_UNLOCK(vp);
vn_close(vp, FWRITE, cred, p);
goto out;
}
if (vp->v_type != VREG || vattr.va_nlink != 1 ||
vattr.va_mode & ((VREAD | VWRITE) >> 3 | (VREAD | VWRITE) >> 6) ||
vattr.va_uid != cred->cr_uid) {
error = EACCES;
VOP_UNLOCK(vp);
vn_close(vp, FWRITE, cred, p);
goto out;
}
vattr_null(&vattr);
vattr.va_size = 0;
VOP_SETATTR(vp, &vattr, cred, p);
VOP_UNLOCK(vp);
vref(vp);
error = vn_close(vp, FWRITE, cred, p);
if (error == 0)
error = vn_rdwr(UIO_WRITE, vp, upp->pr_buf, size,
0, UIO_USERSPACE, IO_UNIT, cred, NULL, p);
vrele(vp);
out:
if (name)
pool_put(&namei_pool, name);
if (cred)
crfree(cred);
if (cdir)
vrele(cdir);
}
void
profclock(struct clockrequest *cr, void *cf, void *arg)
{
uint64_t count;
struct clockframe *frame = cf;
struct proc *p = curproc;
count = clockrequest_advance(cr, profclock_period);
if (count > ULONG_MAX)
count = ULONG_MAX;
if (CLKF_USERMODE(frame)) {
if (ISSET(p->p_p->ps_flags, PS_PROFIL))
addupc_intr(p, CLKF_PC(frame), (u_long)count);
} else {
if (p != NULL && ISSET(p->p_p->ps_flags, PS_PROFIL))
addupc_intr(p, PROC_PC(p), (u_long)count);
}
}
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. Schedule an AST that will vector us to
* trap() with a context in which copyin and copyout will work.
* Trap will then call addupc_task().
*/
void
addupc_intr(struct proc *p, u_long pc, u_long nticks)
{
struct uprof *prof;
prof = &p->p_p->ps_prof;
if (pc < prof->pr_off || PC_TO_INDEX(pc, prof) >= prof->pr_size)
return; /* out of range; ignore */
p->p_prof_addr = pc;
p->p_prof_ticks += nticks;
atomic_setbits_int(&p->p_flag, P_OWEUPC);
need_proftick(p);
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct proc *p, u_long pc, u_int nticks)
{
struct process *pr = p->p_p;
struct uprof *prof;
caddr_t addr;
u_int i;
u_short v;
/* Testing PS_PROFIL may be unnecessary, but is certainly safe. */
if ((pr->ps_flags & PS_PROFIL) == 0 || nticks == 0)
return;
prof = &pr->ps_prof;
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
return;
addr = prof->pr_base + i;
if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) {
v += nticks;
if (copyout((caddr_t)&v, addr, sizeof(v)) == 0)
return;
}
stopprofclock(pr);
}