Commit 80a836c2 authored by Alexei Starovoitov

Merge branch 'BPF_and_RT'

Thomas Gleixner says:

====================
This is the third version of the BPF/RT patch set which makes both coexist
nicely. The long explanation can be found in the cover letter of the V1
submission:

  https://lore.kernel.org/r/20200214133917.304937432@linutronix.de

V2 is here:

  https://lore.kernel.org/r/20200220204517.863202864@linutronix.de



The following changes vs. V2 have been made:

  - Rebased to bpf-next, adjusted to the lock changes in the hashmap code.

  - Split the preallocation enforcement patch for instrumentation type BPF
    programs into two pieces:

    1) Emit a one-time warning on !RT kernels when any instrumentation type
       BPF program uses run-time allocation, and emit a corresponding
       warning in the verifier log, but allow the program to run for
       backward compatibility's sake. After a grace period this should be
       enforced.

    2) On RT, reject such programs because the memory allocator cannot be
       called from truly atomic contexts.

  - Fixed the fallout from V2 as reported by Alexei and 0-day

  - Removed the redundant preempt_disable() from trace_call_bpf()

  - Removed the unused export of trace_call_bpf()
====================
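
For illustration, a hedged sketch of the two-step enforcement described in items 1) and 2) of the cover letter, written as a verifier-side map/program compatibility check. The helper names (is_tracing_prog_type(), check_map_prealloc()) and the messages are assumptions modeled on existing verifier conventions, not a verbatim copy of the patch:

/*
 * Hedged sketch only: helper names and messages are assumptions; the
 * real patch's placement in the verifier may differ.
 */
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
					struct bpf_map *map,
					struct bpf_prog *prog)
{
	if (is_tracing_prog_type(prog->type) && !check_map_prealloc(map)) {
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			/* 2) On RT, reject: the memory allocator cannot be
			 *    called from the truly atomic contexts these
			 *    programs can run in.
			 */
			verbose(env, "RT: instrumentation programs need preallocated hash maps\n");
			return -EINVAL;
		}
		/* 1) On !RT, warn once system-wide and in the verifier log,
		 *    but keep the program loadable for backward compatibility.
		 */
		WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
		verbose(env, "trace type programs with run-time allocated hash maps are unsafe\n");
	}
	return 0;
}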

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 8eece07c 099bfaa7
include/linux/bpf.h +34 −4
@@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
		struct bpf_prog *_prog;			\
		struct bpf_prog_array *_array;		\
		u32 _ret = 1;				\
		preempt_disable();			\
		migrate_disable();			\
		rcu_read_lock();			\
		_array = rcu_dereference(array);	\
		if (unlikely(check_non_null && !_array))\
@@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
		}					\
_out:							\
		rcu_read_unlock();			\
		preempt_enable();			\
		migrate_enable();			\
		_ret;					\
	 })

@@ -932,7 +932,7 @@ _out: \
		u32 ret;				\
		u32 _ret = 1;				\
		u32 _cn = 0;				\
		preempt_disable();			\
		migrate_disable();			\
		rcu_read_lock();			\
		_array = rcu_dereference(array);	\
		_item = &_array->items[0];		\
@@ -944,7 +944,7 @@ _out: \
			_item++;			\
		}					\
		rcu_read_unlock();			\
		preempt_enable();			\
		migrate_enable();			\
		if (_ret)				\
			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
		else					\
@@ -961,6 +961,36 @@ _out: \
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);

/*
 * Block execution of BPF programs attached to instrumentation (perf,
 * kprobes, tracepoints) to prevent deadlocks on map operations as any of
 * these events can happen inside a region which holds a map bucket lock
 * and can deadlock on it.
 *
 * Use the preemption safe inc/dec variants on RT because migrate disable
 * is preemptible on RT and preemption in the middle of the RMW operation
 * might lead to inconsistent state. Use the raw variants for non RT
 * kernels as migrate_disable() maps to preempt_disable() so the slightly
 * more expensive save operation can be avoided.
 */
static inline void bpf_disable_instrumentation(void)
{
	migrate_disable();
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		this_cpu_inc(bpf_prog_active);
	else
		__this_cpu_inc(bpf_prog_active);
}

static inline void bpf_enable_instrumentation(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		this_cpu_dec(bpf_prog_active);
	else
		__this_cpu_dec(bpf_prog_active);
	migrate_enable();
}
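
As a usage illustration (a sketch, not part of this hunk): the sys_bpf() map operation paths in kernel/bpf/syscall.c wrap the actual map update in this pair roughly as follows; the function name and error handling here are simplified placeholders:

/* Sketch of a syscall-side map update under instrumentation protection. */
static int map_update_under_protection(struct bpf_map *map, void *key,
				       void *value, u64 flags)
{
	int err;

	/* Pin the task to the CPU and bump bpf_prog_active so that a
	 * kprobe or perf event firing inside this section cannot invoke
	 * a BPF program which would recurse into the held bucket lock.
	 */
	bpf_disable_instrumentation();
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, flags);
	rcu_read_unlock();
	bpf_enable_instrumentation();

	return err;
}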

extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;

include/linux/filter.h +29 −8
@@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

#define __BPF_PROG_RUN(prog, ctx, dfunc)	({			\
	u32 ret;							\
	cant_sleep();							\
	cant_migrate();							\
	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
		struct bpf_prog_stats *stats;				\
		u64 start = sched_clock();				\
@@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
	}								\
	ret; })

#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx,		\
					       bpf_dispatcher_nopfunc)
#define BPF_PROG_RUN(prog, ctx)						\
	__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)

/*
 * Use in preemptible and therefore migratable context to make sure that
 * the execution of the BPF program runs on one CPU.
 *
 * This uses migrate_disable/enable() explicitly to document that the
 * invocation of a BPF program does not require reentrancy protection
 * against a BPF program which is invoked from a preempting task.
 *
 * For non-RT enabled kernels migrate_disable/enable() maps to
 * preempt_disable/enable(), i.e. it also disables preemption.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
					  const void *ctx)
{
	u32 ret;

	migrate_disable();
	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
	migrate_enable();
	return ret;
}
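
Call sites that previously open-coded preempt_disable(); BPF_PROG_RUN(); preempt_enable() (as bpf_prog_run_clear_cb() below did) can now collapse to a single call. A minimal sketch with a placeholder caller name:

/* Sketch: running a program from fully preemptible process context.
 * No preempt_disable()/migrate_disable() is needed at the call site;
 * the helper pins the execution to one CPU for the duration of the run.
 */
static u32 run_prog_from_process_context(const struct bpf_prog *prog,
					 struct sk_buff *skb)
{
	return bpf_prog_run_pin_on_cpu(prog, skb);
}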

#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

@@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
	return qdisc_skb_cb(skb)->data;
}

/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
					 struct sk_buff *skb)
{
@@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
{
	u32 res;

	preempt_disable();
	migrate_disable();
	res = __bpf_prog_run_save_cb(prog, skb);
	preempt_enable();
	migrate_enable();
	return res;
}

@@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
	if (unlikely(prog->cb_access))
		memset(cb_data, 0, BPF_SKB_CB_LEN);

	preempt_disable();
	res = BPF_PROG_RUN(prog, skb);
	preempt_enable();
	res = bpf_prog_run_pin_on_cpu(prog, skb);
	return res;
}

kernel/bpf/hashtab.c +123 −49
@@ -27,9 +27,62 @@
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior
 * invocation from one of these contexts has completed. sys_bpf() uses the
 * same mechanism by pinning the task to the current CPU and incrementing
 * the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour, these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. These mechanisms require preallocated maps,
 * so there is no need to invoke memory allocations within the lock held
 * sections.
 *
 * BPF maps which need dynamic allocation are only used from (forced)
 * thread context on RT and can therefore use regular spinlocks, which in
 * turn allows memory allocations to be invoked from the lock held section.
 *
 * On a non RT kernel this distinction is neither possible nor required.
 * spinlock maps to raw_spinlock and the extra code is optimized out by the
 * compiler.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t lock;
	union {
		raw_spinlock_t raw_lock;
		spinlock_t     lock;
	};
};

struct bpf_htab {
@@ -68,6 +121,51 @@ struct htab_elem {
	char key[0] __aligned(8);
};

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
{
	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		if (htab_use_raw_lock(htab))
			raw_spin_lock_init(&htab->buckets[i].raw_lock);
		else
			spin_lock_init(&htab->buckets[i].lock);
	}
}

static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
					     struct bucket *b)
{
	unsigned long flags;

	if (htab_use_raw_lock(htab))
		raw_spin_lock_irqsave(&b->raw_lock, flags);
	else
		spin_lock_irqsave(&b->lock, flags);
	return flags;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b,
				      unsigned long flags)
{
	if (htab_use_raw_lock(htab))
		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
	else
		spin_unlock_irqrestore(&b->lock, flags);
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
@@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
@@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;
	int err;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
@@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
	else
		htab->hashrnd = get_random_int();

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}
	htab_init_buckets(htab);

	if (prealloc) {
		err = prealloc_init(htab);
@@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
@@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);

	return l == tgt_l;
}
@@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
		 */
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
@@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l = lookup_elem_raw(head, hash, key, key_size);

@@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	return ret;
}

@@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);
	flags = htab_lock_bucket(htab, b);

	l = lookup_elem_raw(head, hash, key, key_size);

@@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
@@ -1325,8 +1403,7 @@ alloc:
	}

again:
	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
@@ -1335,7 +1412,7 @@ again_nocopy:
	head = &b->head;
	/* do not grab the lock unless need it (bucket_cnt > 0). */
	if (locked)
		raw_spin_lock_irqsave(&b->lock, flags);
		flags = htab_lock_bucket(htab, b);

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@@ -1352,10 +1429,9 @@ again_nocopy:
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		htab_unlock_bucket(htab, b, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		bpf_enable_instrumentation();
		goto after_loop;
	}

@@ -1364,10 +1440,9 @@ again_nocopy:
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		htab_unlock_bucket(htab, b, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
@@ -1418,7 +1493,7 @@ again_nocopy:
		dst_val += value_size;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	htab_unlock_bucket(htab, b, flags);
	locked = false;

	while (node_to_free) {
@@ -1437,8 +1512,7 @@ next_batch:
	}

	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
kernel/bpf/lpm_trie.c +6 −6
@@ -34,7 +34,7 @@ struct lpm_trie {
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	raw_spinlock_t			lock;
	spinlock_t			lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
@@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

@@ -422,7 +422,7 @@ out:
		kfree(im_node);
	}

	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
@@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse.  We will need to know the node
@@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
	kfree_rcu(node, rcu);

out:
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
@@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
	if (ret)
		goto out_err;

	raw_spin_lock_init(&trie->lock);
	spin_lock_init(&trie->lock);

	return &trie->map;
out_err:
kernel/bpf/percpu_freelist.c +10 −10
@@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
	free_percpu(s->freelist);
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
					   struct pcpu_freelist_node *node)
{
	raw_spin_lock(&head->lock);
	node->next = head->first;
	head->first = node;
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
					 struct pcpu_freelist_node *node)
{
	raw_spin_lock(&head->lock);
	pcpu_freelist_push_node(head, node);
	raw_spin_unlock(&head->lock);
}

@@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
			    u32 nr_elems)
{
	struct pcpu_freelist_head *head;
	unsigned long flags;
	int i, cpu, pcpu_entries;

	pcpu_entries = nr_elems / num_possible_cpus() + 1;
	i = 0;

	/* disable irq to workaround lockdep false positive
	 * in bpf usage pcpu_freelist_populate() will never race
	 * with pcpu_freelist_push()
	 */
	local_irq_save(flags);
	for_each_possible_cpu(cpu) {
again:
		head = per_cpu_ptr(s->freelist, cpu);
		___pcpu_freelist_push(head, buf);
		/* No locking required as this is not visible yet. */
		pcpu_freelist_push_node(head, buf);
		i++;
		buf += elem_size;
		if (i == nr_elems)
@@ -78,7 +79,6 @@ again:
		if (i % pcpu_entries)
			goto again;
	}
	local_irq_restore(flags);
}

struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)