Commit 80a836c2 authored by Alexei Starovoitov

Merge branch 'BPF_and_RT'

Thomas Gleixner says:

====================
This is the third version of the BPF/RT patch set which makes both coexist
nicely. The long explanation can be found in the cover letter of the V1
submission:

  https://lore.kernel.org/r/20200214133917.304937432@linutronix.de

V2 is here:

  https://lore.kernel.org/r/20200220204517.863202864@linutronix.de



The following changes vs. V2 have been made:

  - Rebased to bpf-next, adjusted to the lock changes in the hashmap code.

  - Split the preallocation enforcement patch for instrumentation type BPF
    programs into two pieces:

    1) Emit a one-time warning on !RT kernels when any instrumentation type
       BPF program uses run-time allocation, and emit a corresponding
       warning in the verifier log, but allow the program to run for
       backward compatibility's sake. After a grace period this should be
       enforced.

    2) On RT, reject such programs because the memory allocator cannot be
       called from truly atomic contexts.

  - Fixed the fallout from V2 as reported by Alexei and 0-day

  - Removed the redundant preempt_disable() from trace_call_bpf()

  - Removed the unused export of trace_call_bpf()
====================
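
For illustration, a hedged sketch of the two-step enforcement described in items 1) and 2) of the cover letter, written as a verifier-side map/program compatibility check. The helper names (is_tracing_prog_type(), check_map_prealloc()) and the messages are assumptions modeled on existing verifier conventions, not a verbatim copy of the patch:

/*
 * Hedged sketch only: helper names and messages are assumptions; the
 * real patch's placement in the verifier may differ.
 */
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
					struct bpf_map *map,
					struct bpf_prog *prog)
{
	if (is_tracing_prog_type(prog->type) && !check_map_prealloc(map)) {
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			/* 2) On RT, reject: the memory allocator cannot be
			 *    called from the truly atomic contexts these
			 *    programs can run in.
			 */
			verbose(env, "RT: instrumentation programs need preallocated hash maps\n");
			return -EINVAL;
		}
		/* 1) On !RT, warn once system-wide and in the verifier log,
		 *    but keep the program loadable for backward compatibility.
		 */
		WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
		verbose(env, "trace type programs with run-time allocated hash maps are unsafe\n");
	}
	return 0;
}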

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 8eece07c 099bfaa7
include/linux/bpf.h +34 −4
@@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
		struct bpf_prog *_prog;			\
		struct bpf_prog_array *_array;		\
		u32 _ret = 1;				\
		preempt_disable();			\
		migrate_disable();			\
		rcu_read_lock();			\
		_array = rcu_dereference(array);	\
		if (unlikely(check_non_null && !_array))\
@@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
		}					\
_out:							\
		rcu_read_unlock();			\
		preempt_enable();			\
		migrate_enable();			\
		_ret;					\
	 })

@@ -932,7 +932,7 @@ _out: \
		u32 ret;				\
		u32 _ret = 1;				\
		u32 _cn = 0;				\
		preempt_disable();			\
		migrate_disable();			\
		rcu_read_lock();			\
		_array = rcu_dereference(array);	\
		_item = &_array->items[0];		\
@@ -944,7 +944,7 @@ _out: \
			_item++;			\
		}					\
		rcu_read_unlock();			\
		preempt_enable();			\
		migrate_enable();			\
		if (_ret)				\
			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
		else					\
@@ -961,6 +961,36 @@ _out: \
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);

/*
 * Block execution of BPF programs attached to instrumentation (perf,
 * kprobes, tracepoints) to prevent deadlocks on map operations as any of
 * these events can happen inside a region which holds a map bucket lock
 * and can deadlock on it.
 *
 * Use the preemption safe inc/dec variants on RT because migrate disable
 * is preemptible on RT and preemption in the middle of the RMW operation
 * might lead to inconsistent state. Use the raw variants for non RT
 * kernels as migrate_disable() maps to preempt_disable() so the slightly
 * more expensive save operation can be avoided.
 */
static inline void bpf_disable_instrumentation(void)
{
	migrate_disable();
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		this_cpu_inc(bpf_prog_active);
	else
		__this_cpu_inc(bpf_prog_active);
}

static inline void bpf_enable_instrumentation(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		this_cpu_dec(bpf_prog_active);
	else
		__this_cpu_dec(bpf_prog_active);
	migrate_enable();
}
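
As a usage illustration (a sketch, not part of this hunk): the sys_bpf() map operation paths in kernel/bpf/syscall.c wrap the actual map update in this pair roughly as follows; the function name and error handling here are simplified placeholders:

/* Sketch of a syscall-side map update under instrumentation protection. */
static int map_update_under_protection(struct bpf_map *map, void *key,
				       void *value, u64 flags)
{
	int err;

	/* Pin the task to the CPU and bump bpf_prog_active so that a
	 * kprobe or perf event firing inside this section cannot invoke
	 * a BPF program which would recurse into the held bucket lock.
	 */
	bpf_disable_instrumentation();
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, flags);
	rcu_read_unlock();
	bpf_enable_instrumentation();

	return err;
}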

extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;

include/linux/filter.h +29 −8
@@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

#define __BPF_PROG_RUN(prog, ctx, dfunc)	({			\
	u32 ret;							\
	cant_sleep();							\
	cant_migrate();							\
	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
		struct bpf_prog_stats *stats;				\
		u64 start = sched_clock();				\
@@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
	}								\
	ret; })

#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx,		\
					       bpf_dispatcher_nopfunc)
#define BPF_PROG_RUN(prog, ctx)						\
	__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)

/*
 * Use in preemptible and therefore migratable context to make sure that
 * the execution of the BPF program runs on one CPU.
 *
 * This uses migrate_disable/enable() explicitly to document that the
 * invocation of a BPF program does not require reentrancy protection
 * against a BPF program which is invoked from a preempting task.
 *
 * For non-RT enabled kernels migrate_disable/enable() maps to
 * preempt_disable/enable(), i.e. it also disables preemption.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
					  const void *ctx)
{
	u32 ret;

	migrate_disable();
	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
	migrate_enable();
	return ret;
}
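
Call sites that previously open-coded preempt_disable(); BPF_PROG_RUN(); preempt_enable() (as bpf_prog_run_clear_cb() below did) can now collapse to a single call. A minimal sketch with a placeholder caller name:

/* Sketch: running a program from fully preemptible process context.
 * No preempt_disable()/migrate_disable() is needed at the call site;
 * the helper pins the execution to one CPU for the duration of the run.
 */
static u32 run_prog_from_process_context(const struct bpf_prog *prog,
					 struct sk_buff *skb)
{
	return bpf_prog_run_pin_on_cpu(prog, skb);
}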

#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

@@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
	return qdisc_skb_cb(skb)->data;
}

/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
					 struct sk_buff *skb)
{
@@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
{
	u32 res;

	preempt_disable();
	migrate_disable();
	res = __bpf_prog_run_save_cb(prog, skb);
	preempt_enable();
	migrate_enable();
	return res;
}

@@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
	if (unlikely(prog->cb_access))
		memset(cb_data, 0, BPF_SKB_CB_LEN);

	preempt_disable();
	res = BPF_PROG_RUN(prog, skb);
	preempt_enable();
	res = bpf_prog_run_pin_on_cpu(prog, skb);
	return res;
}

kernel/bpf/hashtab.c +123 −49
@@ -27,9 +27,62 @@
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior
 * invocation from one of these contexts has completed. sys_bpf() uses the
 * same mechanism by pinning the task to the current CPU and incrementing
 * the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour, these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. These mechanisms require preallocated maps,
 * so there is no need to invoke memory allocations within the lock held
 * sections.
 *
 * BPF maps which need dynamic allocation are only used from (forced)
 * thread context on RT and can therefore use regular spinlocks, which in
 * turn allows memory allocations to be invoked from the lock held section.
 *
 * On a non RT kernel this distinction is neither possible nor required.
 * spinlock maps to raw_spinlock and the extra code is optimized out by the
 * compiler.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t lock;
	union {
		raw_spinlock_t raw_lock;
		spinlock_t     lock;
	};
};

struct bpf_htab {
@@ -68,6 +121,51 @@ struct htab_elem {
	char key[0] __aligned(8);
};

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
{
	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		if (htab_use_raw_lock(htab))
			raw_spin_lock_init(&htab->buckets[i].raw_lock);
		else
			spin_lock_init(&htab->buckets[i].lock);
	}
}

static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
					     struct bucket *b)
{
	unsigned long flags;

	if (htab_use_raw_lock(htab))
		raw_spin_lock_irqsave(&b->raw_lock, flags);
	else
		spin_lock_irqsave(&b->lock, flags);
	return flags;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b,
				      unsigned long flags)
{
	if (htab_use_raw_lock(htab))
		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
	else
		spin_unlock_irqrestore(&b->lock, flags);
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
@@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
@@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;
	int err;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
@@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
	else
		htab->hashrnd = get_random_int();

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}
	htab_init_buckets(htab);

	if (prealloc) {
		err = prealloc_init(htab);
@@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
@@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);

	return l == tgt_l;
}
@@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
		 */
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
@@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l = lookup_elem_raw(head, hash, key, key_size);

@@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l = lookup_elem_raw(head, hash, key, key_size);

@@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
@@ -1325,8 +1403,7 @@ alloc:
	}

again:
	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
@@ -1335,7 +1412,7 @@ again_nocopy:
	head = &b->head;
	/* do not grab the lock unless need it (bucket_cnt > 0). */
	if (locked)
		raw_spin_lock_irqsave(&b->lock, flags);
		flags = htab_lock_bucket(htab, b);

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@@ -1352,10 +1429,9 @@ again_nocopy:
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		htab_unlock_bucket(htab, b, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		bpf_enable_instrumentation();
		goto after_loop;
	}

@@ -1364,10 +1440,9 @@ again_nocopy:
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		htab_unlock_bucket(htab, b, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
@@ -1418,7 +1493,7 @@ again_nocopy:
		dst_val += value_size;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	locked = false;

	while (node_to_free) {
@@ -1437,8 +1512,7 @@ next_batch:
	}

	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
kernel/bpf/lpm_trie.c +6 −6
@@ -34,7 +34,7 @@ struct lpm_trie {
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	raw_spinlock_t			lock;
	spinlock_t			lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
@@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

@@ -422,7 +422,7 @@ out:
		kfree(im_node);
	}

	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
@@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse.  We will need to know the node
@@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
	kfree_rcu(node, rcu);

out:
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
@@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
	if (ret)
		goto out_err;

	raw_spin_lock_init(&trie->lock);
	spin_lock_init(&trie->lock);

	return &trie->map;
out_err:
kernel/bpf/percpu_freelist.c +10 −10
@@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
	free_percpu(s->freelist);
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
					   struct pcpu_freelist_node *node)
{
	raw_spin_lock(&head->lock);
	node->next = head->first;
	head->first = node;
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
					 struct pcpu_freelist_node *node)
{
	raw_spin_lock(&head->lock);
	pcpu_freelist_push_node(head, node);
	raw_spin_unlock(&head->lock);
}

@@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
			    u32 nr_elems)
{
	struct pcpu_freelist_head *head;
	unsigned long flags;
	int i, cpu, pcpu_entries;

	pcpu_entries = nr_elems / num_possible_cpus() + 1;
	i = 0;

	/* disable irq to workaround lockdep false positive
	 * in bpf usage pcpu_freelist_populate() will never race
	 * with pcpu_freelist_push()
	 */
	local_irq_save(flags);
	for_each_possible_cpu(cpu) {
again:
		head = per_cpu_ptr(s->freelist, cpu);
		___pcpu_freelist_push(head, buf);
		/* No locking required as this is not visible yet. */
		pcpu_freelist_push_node(head, buf);
		i++;
		buf += elem_size;
		if (i == nr_elems)
@@ -78,7 +79,6 @@ again:
		if (i % pcpu_entries)
			goto again;
	}
	local_irq_restore(flags);
}

struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)