Commit 2863debf authored by Daniel Borkmann
Browse files

Merge branch 'bpf-spinlocks'



Alexei Starovoitov says:

====================
Many algorithms need to read and modify several variables atomically.
Until now it was hard to impossible to implement such algorithms in BPF.
Hence introduce support for bpf_spin_lock.

The API consists of 'struct bpf_spin_lock', which should be placed
inside a hash/array/cgroup_local_storage map element,
and the bpf_spin_lock()/bpf_spin_unlock() helper functions.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

and BPF_F_LOCK flag for lookup/update bpf syscall commands that
allows user space to read/write map elements under lock.

Together these primitives allow race free access to map elements
from bpf programs and from user space.

Key restriction: root only.
Key requirement: maps must be annotated with BTF.

This concept was discussed at Linux Plumbers Conference 2018.
Thank you everyone who participated and helped to iron out details
of api and implementation.

Patch 1: bpf_spin_lock support in the verifier, BTF, hash, array.
Patch 2: bpf_spin_lock in cgroup local storage.
Patches 3,4,5: tests
Patch 6: BPF_F_LOCK flag to lookup/update
Patches 7,8,9: tests

v6->v7:
- fixed this_cpu->__this_cpu per Peter's suggestion and added Ack.
- simplified bpf_spin_lock and load/store overlap check in the verifier
  as suggested by Andrii
- rebase

v5->v6:
- adopted arch_spinlock approach suggested by Peter
- switched to spin_lock_irqsave equivalent as the simplest way
  to avoid deadlocks in the rare case of nested networking progs
  (cgroup-bpf prog in preempt_disable vs clsbpf in softirq sharing
  the same map with bpf_spin_lock)
  bpf_spin_lock is only allowed in networking progs that don't
  have arbitrary entry points unlike tracing progs.
- rebase and split test_verifier tests

v4->v5:
- disallow bpf_spin_lock for tracing progs due to insufficient preemption checks
- socket filter progs cannot use bpf_spin_lock due to missing preempt_disable
- fix atomic_set_release. Spotted by Peter.
- fixed hash_of_maps

v3->v4:
- fix BPF_EXIST | BPF_NOEXIST check patch 6. Spotted by Jakub. Thanks!
- rebase

v2->v3:
- fixed build on ia64 and archs where qspinlock is not supported
- fixed missing lock init during lookup w/o BPF_F_LOCK. Spotted by Martin

v1->v2:
- addressed several issues spotted by Daniel and Martin in patch 1
- added test11 to patch 4 as suggested by Daniel
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 1832f4ef ba72a7b4
Loading
Loading
Loading
Loading
+36 −3
Original line number Diff line number Diff line
@@ -72,14 +72,15 @@ struct bpf_map {
	u32 value_size;
	u32 max_entries;
	u32 map_flags;
	u32 pages;
	int spin_lock_off; /* >=0 valid offset, <0 error */
	u32 id;
	int numa_node;
	u32 btf_key_type_id;
	u32 btf_value_type_id;
	struct btf *btf;
	u32 pages;
	bool unpriv_array;
	/* 55 bytes hole */
	/* 51 bytes hole */

	/* The 3rd and 4th cacheline with misc members to avoid false sharing
	 * particularly with refcounting.
@@ -91,6 +92,36 @@ struct bpf_map {
	char name[BPF_OBJ_NAME_LEN];
};

/* Report whether @map's values carry a bpf_spin_lock.
 *
 * spin_lock_off holds a valid offset when it is >= 0; negative values
 * encode an error, so any non-negative offset means a lock is present.
 */
static inline bool map_value_has_spin_lock(const struct bpf_map *map)
{
	const int lock_off = map->spin_lock_off;

	return !(lock_off < 0);
}

/* Reset the bpf_spin_lock embedded in the map value at @dst to its
 * zero-initialized state. Does nothing for maps whose values have no lock,
 * which is the expected common case.
 */
static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
{
	struct bpf_spin_lock *lock;

	if (likely(!map_value_has_spin_lock(map)))
		return;

	/* the lock lives spin_lock_off bytes into the value */
	lock = dst + map->spin_lock_off;
	*lock = (struct bpf_spin_lock){};
}

/* copy everything but bpf_spin_lock */
/* Copy a map value of map->value_size bytes from @src to @dst.
 *
 * When the value embeds a bpf_spin_lock, the bytes occupied by the lock
 * are skipped: only the region before it and the region after it are
 * copied, so the lock in @dst is left as it was.
 */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
	if (unlikely(map_value_has_spin_lock(map))) {
		const u32 off = map->spin_lock_off;
		void *dst_tail = dst + off + sizeof(struct bpf_spin_lock);
		void *src_tail = src + off + sizeof(struct bpf_spin_lock);

		/* bytes in front of the lock */
		memcpy(dst, src, off);
		/* bytes behind the lock, up to value_size */
		memcpy(dst_tail, src_tail,
		       map->value_size - off - sizeof(struct bpf_spin_lock));
		return;
	}

	memcpy(dst, src, map->value_size);
}
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
			   bool lock_src);

struct bpf_offload_dev;
struct bpf_offloaded_map;

@@ -162,6 +193,7 @@ enum bpf_arg_type {
	ARG_PTR_TO_CTX,		/* pointer to context */
	ARG_ANYTHING,		/* any (initialized) argument is ok */
	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
};

/* type of values returned from helper functions */
@@ -879,7 +911,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;

extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;

/* Shared helpers among cBPF and eBPF. */
+1 −0
Original line number Diff line number Diff line
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
	/* call stack tracking */
	struct bpf_func_state *frame[MAX_CALL_FRAMES];
	u32 curframe;
	u32 active_spin_lock;
	bool speculative;
};

+1 −0
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
			   const struct btf_member *m,
			   u32 expected_offset, u32 expected_size);
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);

#ifdef CONFIG_BPF_SYSCALL
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
+7 −1
Original line number Diff line number Diff line
@@ -267,6 +267,7 @@ enum bpf_attach_type {
#define BPF_ANY		0 /* create new element or update existing */
#define BPF_NOEXIST	1 /* create new element if it didn't exist */
#define BPF_EXIST	2 /* update existing element */
#define BPF_F_LOCK	4 /* spin_lock-ed map_lookup/map_update */

/* flags for BPF_MAP_CREATE command */
#define BPF_F_NO_PREALLOC	(1U << 0)
@@ -2422,7 +2423,9 @@ union bpf_attr {
	FN(map_peek_elem),		\
	FN(msg_push_data),		\
	FN(msg_pop_data),		\
	FN(rc_pointer_rel),
	FN(rc_pointer_rel),		\
	FN(spin_lock),			\
	FN(spin_unlock),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -3056,4 +3059,7 @@ struct bpf_line_info {
	__u32	line_col;
};

/* Lock object operated on by the bpf_spin_lock()/bpf_spin_unlock() helpers.
 * Intended to be embedded as a member of a map value (hash, array or
 * cgroup local storage element).
 */
struct bpf_spin_lock {
	__u32	val;	/* lock word */
};
#endif /* _UAPI__LINUX_BPF_H__ */
+3 −0
Original line number Diff line number Diff line
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
	depends on SMP

# Promptless bool: not directly user-configurable from this entry.
# NOTE(review): presumably set via "select"/defaults elsewhere to choose the
# arch spinlock based implementation of bpf_spin_lock - confirm.
config BPF_ARCH_SPINLOCK
	bool

config ARCH_USE_QUEUED_RWLOCKS
	bool

Loading