Edit

IABSD.fr/src/sys/kern/sys_futex.c

Branch :

  • Show log

    Commit

  • Author : claudio
    Date : 2025-06-05 08:44:00
    Hash : c6592223
    Message : This code no longer uses tick_nsec so there is no need for sys/kernel.h OK dlg@

  • sys/kern/sys_futex.c
  • /*	$OpenBSD: sys_futex.c,v 1.25 2025/06/05 08:44:00 claudio Exp $ */
    
    /*
     * Copyright (c) 2016-2017 Martin Pieuchot
     *
     * Permission to use, copy, modify, and distribute this software for any
     * purpose with or without fee is hereby granted, provided that the above
     * copyright notice and this permission notice appear in all copies.
     *
     * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     */
    
    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/proc.h>
    #include <sys/mount.h>
    #include <sys/syscallargs.h>
    #include <sys/pool.h>
    #include <sys/time.h>
    #include <sys/rwlock.h>
    #include <sys/percpu.h> /* CACHELINESIZE */
    #include <sys/futex.h>
    
    #ifdef KTRACE
    #include <sys/ktrace.h>
    #endif
    
    #include <uvm/uvm.h>
    
    /*
     * Locks used to protect variables in this file:
     *
     *	I	immutable after initialization
     *	f	futex_slpque fsq_lock
     */
    
    /*
     * Kernel representation of a futex.
     *
     * The userland address that the futex is waiting on is represented by
     * ft_ps, ft_obj, ft_amap, and ft_off.
     *
     * Whether the futex is waiting or woken up is represented by the
     * ft_proc pointer being set (ie, not NULL) or not (ie, NULL) respectively.
     * When the futex is waiting it is referenced by the list in a
     * futex_slpque. When the futex gets woken up, it is removed from the
     * list and the ft_proc pointer is cleared to indicate that the reference
     * held by the list has been released. Only a thread holding the lock
     * may remove the futex from the list and clear ft_proc. This is true
     * even for futex_wait().
     *
     * However, futex_wait() may read ft_proc without the lock so it can
     * avoid contending with the thread that just woke it up. This means
     * that once ft_proc is cleared, futex_wait() may return, the struct
     * futex will no longer exist, and it is no longer safe to access it
     * from the wakeup side.
     *
     * tl;dr: the thread holding the slpque lock "owns" the references
     * to the futexes on the list until it clears ft_proc.
     */
    
    struct futex {
    	TAILQ_ENTRY(futex)	 ft_entry;	/* [f] entry on futex_slpque */
    
    	struct process		*ft_ps;		/* [I] for private futexes */
    	struct uvm_object	*ft_obj;	/* [f] UVM object */
    	struct vm_amap		*ft_amap;	/* [f] UVM amap */
    	volatile voff_t		 ft_off;	/* [f] UVM offset */
    
    	struct proc * volatile	 ft_proc;	/* [f] waiting thread */
    };
    
    static int
    futex_is_eq(const struct futex *a, const struct futex *b)
    {
    	return (a->ft_off == b->ft_off &&
    	    a->ft_ps == b->ft_ps &&
    	    a->ft_obj == b->ft_obj &&
    	    a->ft_amap == b->ft_amap);
    }
    
    TAILQ_HEAD(futex_list, futex);
    
    struct futex_slpque {
    	struct futex_list	fsq_list;	/* [F] */
    	struct rwlock		fsq_lock;
    	uint32_t		fsq_id;		/* [I] for lock ordering */
    } __aligned(CACHELINESIZE);
    
    /* Syscall helpers. */
    static int	futex_wait(struct proc *, uint32_t *, uint32_t,
    		    const struct timespec *, int);
    static int	futex_wake(struct proc *, uint32_t *, uint32_t, int,
    		    register_t *);
    static int	futex_requeue(struct proc *, uint32_t *, uint32_t,
    		    uint32_t *, uint32_t, int, register_t *);
    
    /* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */
    #define FT_PRIVATE	FUTEX_PRIVATE_FLAG	/* Futex is process-private. */
    
    #define FUTEX_SLPQUES_BITS	6
    #define FUTEX_SLPQUES_SIZE	(1U << FUTEX_SLPQUES_BITS)
    #define FUTEX_SLPQUES_MASK	(FUTEX_SLPQUES_SIZE - 1)
    
    static struct futex_slpque futex_slpques[FUTEX_SLPQUES_SIZE];
    
    void
    futex_init(void)
    {
    	struct futex_slpque *fsq;
    	unsigned int i;
    
    	for (i = 0; i < nitems(futex_slpques); i++) {
    		fsq = &futex_slpques[i];
    
    		TAILQ_INIT(&fsq->fsq_list);
    		rw_init(&fsq->fsq_lock, "futexlk");
    
    		fsq->fsq_id = arc4random();
    		fsq->fsq_id &= ~FUTEX_SLPQUES_MASK;
    		fsq->fsq_id |= i;
    	}
    }
    
    int
    sys_futex(struct proc *p, void *v, register_t *retval)
    {
    	struct sys_futex_args /* {
    		syscallarg(uint32_t *) f;
    		syscallarg(int) op;
    		syscallarg(inr) val;
    		syscallarg(const struct timespec *) timeout;
    		syscallarg(uint32_t *) g;
    	} */ *uap = v;
    	uint32_t *uaddr = SCARG(uap, f);
    	int op = SCARG(uap, op);
    	uint32_t val = SCARG(uap, val);
    	const struct timespec *timeout = SCARG(uap, timeout);
    	void *g = SCARG(uap, g);
    	int flags = op & FUTEX_FLAG_MASK;
    	int error = 0;
    
    	switch (op & FUTEX_OP_MASK) {
    	case FUTEX_WAIT:
    		error = futex_wait(p, uaddr, val, timeout, flags);
    		break;
    	case FUTEX_WAKE:
    		error = futex_wake(p, uaddr, val, flags, retval);
    		break;
    	case FUTEX_REQUEUE:
    		error = futex_requeue(p, uaddr, val, g,
    		    (u_long)timeout, flags, retval);
    		break;
    	default:
    		error = ENOSYS;
    		break;
    	}
    
    	return error;
    }
    
    static void
    futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags)
    {
    	vm_map_t map = &p->p_vmspace->vm_map;
    	vm_map_entry_t entry;
    	struct uvm_object *obj = NULL;
    	struct vm_amap *amap = NULL;
    	voff_t off = (vaddr_t)uaddr;
    	struct process *ps;
    
    	if (ISSET(flags, FT_PRIVATE))
    		ps = p->p_p;
    	else {
    		ps = NULL;
    
    		vm_map_lock_read(map);
    		if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
    		    entry->inheritance == MAP_INHERIT_SHARE) {
    			if (UVM_ET_ISOBJ(entry)) {
    				obj = entry->object.uvm_obj;
    				off = entry->offset +
    				    ((vaddr_t)uaddr - entry->start);
    			} else if (entry->aref.ar_amap) {
    				amap = entry->aref.ar_amap;
    				off = ptoa(entry->aref.ar_pageoff) +
    				    ((vaddr_t)uaddr - entry->start);
    			}
    		}
    		vm_map_unlock_read(map);
    	}
    
    	f->ft_ps = ps;
    	f->ft_obj = obj;
    	f->ft_amap = amap;
    	f->ft_off = off;
    }
    
    static inline struct futex_slpque *
    futex_get_slpque(struct futex *f)
    {
    	uint32_t key = f->ft_off >> 3; /* watevs */
    	key ^= key >> FUTEX_SLPQUES_BITS;
    
    	return (&futex_slpques[key & FUTEX_SLPQUES_MASK]);
    }
    
    static int
    futex_unwait(struct futex_slpque *ofsq, struct futex *f)
    {
    	struct futex_slpque *fsq;
    	int rv;
    
    	/*
    	 * REQUEUE can move a futex between buckets, so follow it if needed.
    	 */
    
    	for (;;) {
    		rw_enter_write(&ofsq->fsq_lock);
    		fsq = futex_get_slpque(f);
    		if (ofsq == fsq)
    			break;
    
    		rw_exit_write(&ofsq->fsq_lock);
    		ofsq = fsq;
    	}
    
    	rv = f->ft_proc != NULL;
    	if (rv)
    		TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
    	rw_exit_write(&fsq->fsq_lock);
    
    	return (rv);
    }
    
    /*
     * Put the current thread on the sleep queue of the futex at address
     * ``uaddr''.  Let it sleep for the specified ``timeout'' time, or
     * indefinitely if the argument is NULL.
     */
    static int
    futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val,
        const struct timespec *timeout, int flags)
    {
    	struct futex f;
    	struct futex_slpque *fsq;
    	uint64_t nsecs = INFSLP;
    	uint32_t cval;
    	int error;
    
    	if (timeout != NULL) {
    		struct timespec ts;
    
    		if ((error = copyin(timeout, &ts, sizeof(ts))))
    			return error;
    #ifdef KTRACE
    		if (KTRPOINT(p, KTR_STRUCT))
    			ktrreltimespec(p, &ts);
    #endif
    		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
    			return EINVAL;
    
    		nsecs = MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP);
    		if (nsecs == 0)
    			return ETIMEDOUT;
    	}
    
    	futex_addrs(p, &f, uaddr, flags);
    	fsq = futex_get_slpque(&f);
    
    	/* Mark futex as waiting. */
    	f.ft_proc = p;
    	rw_enter_write(&fsq->fsq_lock);
    	/* Make the waiting futex visible to wake/requeue */
    	TAILQ_INSERT_TAIL(&fsq->fsq_list, &f, ft_entry);
    	rw_exit_write(&fsq->fsq_lock);
    
    	/*
    	 * Do not return before f has been removed from the slpque!
    	 */
    
    	/*
    	 * Read user space futex value
    	 */
    	if ((error = copyin32(uaddr, &cval)) != 0)
    		goto exit;
    
    	/* If the value changed, stop here. */
    	if (cval != val) {
    		error = EAGAIN;
    		goto exit;
    	}
    
    	sleep_setup(&f, PWAIT|PCATCH, "fsleep");
    	error = sleep_finish(nsecs, f.ft_proc != NULL);
    	/* Remove ourself if we haven't been awaken. */
    	if (error != 0 || f.ft_proc != NULL) {
    		if (futex_unwait(fsq, &f) == 0)
    			error = 0;
    
    		switch (error) {
    		case ERESTART:
    			error = ECANCELED;
    			break;
    		case EWOULDBLOCK:
    			error = ETIMEDOUT;
    			break;
    		default:
    			break;
    		}
    	}
    
    	return error;
    exit:
    	if (f.ft_proc != NULL)
    		futex_unwait(fsq, &f);
    	return error;
    }
    
    static void
    futex_list_wakeup(struct futex_list *fl)
    {
    	struct futex *f, *nf;
    	struct proc *p;
    
    	/*
    	 * Setting ft_proc to NULL releases the futex reference
    	 * currently held via the slpque lock.
    	 *
    	 * SCHED_LOCK is only needed to call wakeup_proc.
    	 */
    
    	SCHED_LOCK();
    	TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
    		p = f->ft_proc;
    		f->ft_proc = NULL;
    		wakeup_proc(p);
    	}
    	SCHED_UNLOCK();
    }
    
    /*
     * Wakeup at most ``n'' sibling threads sleeping on a futex at address
     * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
     * address ``uaddr2''.
     */
    static int
    futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n,
        uint32_t *uaddr2, uint32_t m, int flags, register_t *retval)
    {
    	struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
    	struct futex okey, nkey;
    	struct futex *f, *nf, *mf = NULL;
    	struct futex_slpque *ofsq, *nfsq;
    	uint32_t count = 0;
    
    	if (m == 0)
    		return futex_wake(p, uaddr, n, flags, retval);
    
    	futex_addrs(p, &okey, uaddr, flags);
    	ofsq = futex_get_slpque(&okey);
    	futex_addrs(p, &nkey, uaddr2, flags);
    	nfsq = futex_get_slpque(&nkey);
    
    	if (ofsq->fsq_id < nfsq->fsq_id) {
    		rw_enter_write(&ofsq->fsq_lock);
    		rw_enter_write(&nfsq->fsq_lock);
    	} else if (ofsq->fsq_id > nfsq->fsq_id) {
    		rw_enter_write(&nfsq->fsq_lock);
    		rw_enter_write(&ofsq->fsq_lock);
    	} else
    		rw_enter_write(&ofsq->fsq_lock);
    
    	TAILQ_FOREACH_SAFE(f, &ofsq->fsq_list, ft_entry, nf) {
    		/* __builtin_prefetch(nf, 1); */
    		KASSERT(f->ft_proc != NULL);
    
    		if (!futex_is_eq(f, &okey))
    			continue;
    
    		TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
    		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
    
    		if (++count == n) {
    			mf = nf;
    			break;
    		}
    	}
    
    	if (!TAILQ_EMPTY(&fl))
    		futex_list_wakeup(&fl);
    
    	/* update matching futexes */
    	if (mf != NULL) {
    		/*
    		 * only iterate from the current entry to the tail
    		 * of the list as it is now in case we're requeueing
    		 * on the end of the same list.
    		 */
    		nf = TAILQ_LAST(&ofsq->fsq_list, futex_list);
    		do {
    			f = mf;
    			mf = TAILQ_NEXT(f, ft_entry);
    			/* __builtin_prefetch(mf, 1); */
    
    			KASSERT(f->ft_proc != NULL);
    
    			if (!futex_is_eq(f, &okey))
    				continue;
    
    			TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
    			f->ft_ps = nkey.ft_ps;
    			f->ft_obj = nkey.ft_obj;
    			f->ft_amap = nkey.ft_amap;
    			f->ft_off = nkey.ft_off;
    			TAILQ_INSERT_TAIL(&nfsq->fsq_list, f, ft_entry);
    
    			if (--m == 0)
    				break;
    		} while (f != nf);
    	}
    
    	if (ofsq->fsq_id != nfsq->fsq_id)
    		rw_exit_write(&nfsq->fsq_lock);
    	rw_exit_write(&ofsq->fsq_lock);
    
    	*retval = count;
    	return 0;
    }
    
    /*
     * Wakeup at most ``n'' sibling threads sleeping on a futex at address
     * ``uaddr''.
     */
    static int
    futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags,
        register_t *retval)
    {
    	struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
    	struct futex key;
    	struct futex *f, *nf;
    	struct futex_slpque *fsq;
    	int count = 0;
    
    	if (n == 0) {
    		*retval = 0;
    		return 0;
    	}
    
    	futex_addrs(p, &key, uaddr, flags);
    	fsq = futex_get_slpque(&key);
    
    	rw_enter_write(&fsq->fsq_lock);
    
    	TAILQ_FOREACH_SAFE(f, &fsq->fsq_list, ft_entry, nf) {
    		/* __builtin_prefetch(nf, 1); */
    		KASSERT(f->ft_proc != NULL);
    
    		if (!futex_is_eq(f, &key))
    			continue;
    
    		TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
    		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
    
    		if (++count == n)
    			break;
    	}
    
    	if (!TAILQ_EMPTY(&fl))
    		futex_list_wakeup(&fl);
    
    	rw_exit_write(&fsq->fsq_lock);
    
    	*retval = count;
    	return 0;
    }