Commit ee5a489f authored by David S. Miller


Daniel Borkmann says:

====================
pull-request: bpf-next 2019-11-20

The following pull-request contains BPF updates for your *net-next* tree.

We've added 81 non-merge commits during the last 17 days, which contain
a total of 120 files changed, 4958 insertions(+), 1081 deletions(-).

There are 3 trivial conflicts; resolve them by always taking the chunk from
196e8ca7:

<<<<<<< HEAD
=======
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
>>>>>>> 196e8ca7

<<<<<<< HEAD
void *bpf_map_area_alloc(u64 size, int numa_node)
=======
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
>>>>>>> 196e8ca7

<<<<<<< HEAD
        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
=======
        /* kmalloc()'ed memory can't be mmap()'ed */
        if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
>>>>>>> 196e8ca7

The main changes are:

1) Addition of BPF trampoline which works as a bridge between kernel functions,
   BPF programs and other BPF programs along with two new use cases: i) fentry/fexit
   BPF programs for tracing with practically zero overhead to call into BPF (as
   opposed to k[ret]probes) and ii) attachment of fentry/fexit programs to networking
   BPF programs to observe their input/output (covering the xdpdump use case; a
   hedged sketch follows after this list), from Alexei Starovoitov.

2) BPF array map mmap support and its use in libbpf for global data maps (a sketch
   follows after this list); also a big batch of libbpf improvements, among them
   support for reading bitfields in a relocatable manner (via libbpf's CO-RE helper
   API), from Andrii Nakryiko.

3) Extend s390x JIT with usage of relative long jumps and loads in order to lift
   the current 64/512k size limits on JITed BPF programs there, from Ilya Leoshkevich.

4) Add BPF audit support and emit messages upon successful prog load and unload in
   order to have a timeline of events, from Daniel Borkmann and Jiri Olsa.

5) Extension to libbpf and the xdpsock sample programs to demo the shared umem mode
   (XDP_SHARED_UMEM) as well as RX-only and TX-only sockets (a sketch follows after
   this list), from Magnus Karlsson.

6) Several follow-up bug fixes for libbpf's auto-pinning code and a new API
   call named bpf_get_link_xdp_info() for retrieving the full set of prog
   IDs attached to XDP (a usage sketch follows after this list), from Toke
   Høiland-Jørgensen.

7) Add BTF support for array of int, array of struct and multidimensional arrays
   and enable it for skb->cb[] access in kfree_skb test, from Martin KaFai Lau.

8) Fix AF_XDP by using the correct number of channels from ethtool, from Luigi Rizzo.

9) Two fixes for BPF selftests: one to get rid of a hang in test_tc_tunnel and one
   to avoid xdping being run standalone, from Jiri Benc.

10) Various BPF selftest fixes when run with latest LLVM trunk, from Yonghong Song.

11) Fix a memory leak in BPF fentry test run data, from Colin Ian King.

12) Various smaller misc cleanups and improvements mostly all over BPF selftests and
    samples, from Daniel T. Lee, Andre Guedes, Anders Roxell, Mao Wenan, Yue Haibing.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents e2193c93 196e8ca7
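
The fentry/fexit programs from item 1 above can be illustrated with a short,
hedged sketch. This is not code from the series: the traced function
(bpf_fentry_test1), the program and counter names, and the header paths are
assumptions about a contemporary libbpf setup.

// SPDX-License-Identifier: GPL-2.0
/* Hedged sketch: count entries into a kernel function via fentry. */
#include <linux/types.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

__u64 calls = 0;	/* global datum, backed by an internal array map */

SEC("fentry/bpf_fentry_test1")	/* assumed target function */
int count_entry(unsigned long long *ctx)
{
	/* For fentry programs, ctx[i] holds the i-th argument of the
	 * traced function; this sketch only counts invocations. */
	__sync_fetch_and_add(&calls, 1);
	return 0;
}

Unlike a k[ret]probe, the trampoline calls the program directly from the
patched function entry, which is where the near-zero call overhead comes from.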
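
Item 2's map mmap support can be sketched from user space as follows. The map
name and sizes are illustrative, and the sketch assumes uapi headers that
already define BPF_F_MMAPABLE (introduced by this series):

/* Hedged sketch: create a mmapable array map and update its value
 * through a memory mapping instead of bpf() syscalls. */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "mmap_demo",
				     sizeof(__u32), sizeof(__u64), 1,
				     BPF_F_MMAPABLE);
	if (fd < 0) {
		perror("map create");
		return 1;
	}

	/* index 0 of the array sits at offset 0 of the mapping */
	__u64 *val = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	if (val == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	*val = 42;	/* visible to BPF programs using this map, no syscall */
	munmap(val, page);
	close(fd);
	return 0;
}

libbpf uses the same mechanism for global data maps, so reads and writes of
globals need no bpf() syscalls.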
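
For item 5, creating RX-only or TX-only sockets boils down to passing NULL for
the ring you do not want, matching the af_xdp.rst change further below. A
hedged sketch, assuming an already-created umem, interface "eth0" and queue 0:

/* Hedged sketch: Rx-only AF_XDP socket via libbpf's xsk.h. */
#include <bpf/xsk.h>

static int make_rx_only_socket(struct xsk_umem *umem,
			       struct xsk_ring_cons *rx,
			       struct xsk_socket **xsk)
{
	/* NULL tx: no Tx resources are set up for this socket;
	 * swap the rx/tx NULLs to get a Tx-only socket instead.
	 * NULL config selects the library defaults. */
	return xsk_socket__create(xsk, "eth0", /*queue_id=*/0,
				  umem, rx, /*tx=*/NULL, /*config=*/NULL);
}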
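
For item 6, a minimal usage sketch of the new call. The struct field names
follow the libbpf patch as best recalled here, and the ifindex handling is an
assumption:

/* Hedged sketch: query all prog IDs attached to XDP on an interface. */
#include <stdio.h>
#include <bpf/libbpf.h>

static void show_xdp_progs(int ifindex)
{
	struct xdp_link_info info = {};

	if (bpf_get_link_xdp_info(ifindex, &info, sizeof(info), 0))
		return;

	printf("running: %u drv: %u skb: %u hw: %u\n",
	       info.prog_id, info.drv_prog_id,
	       info.skb_prog_id, info.hw_prog_id);
}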
+23 −5
@@ -295,7 +295,7 @@ round-robin example of distributing packets is shown below:
   {
 	rr = (rr + 1) & (MAX_SOCKS - 1);

-	return bpf_redirect_map(&xsks_map, rr, 0);
+	return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
   }

 Note, that since there is only a single set of FILL and COMPLETION
@@ -304,6 +304,12 @@ to make sure that multiple processes or threads do not use these rings
 concurrently. There are no synchronization primitives in the
 libbpf code that protects multiple users at this point in time.

+Libbpf uses this mode if you create more than one socket tied to the
+same umem. However, note that you need to supply the
+XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf_flag with the
+xsk_socket__create calls and load your own XDP program as there is no
+built in one in libbpf that will route the traffic for you.
+
 XDP_USE_NEED_WAKEUP bind flag
 -----------------------------

@@ -355,10 +361,22 @@ to set the size of at least one of the RX and TX rings. If you set
 both, you will be able to both receive and send traffic from your
 application, but if you only want to do one of them, you can save
 resources by only setting up one of them. Both the FILL ring and the
-COMPLETION ring are mandatory if you have a UMEM tied to your socket,
-which is the normal case. But if the XDP_SHARED_UMEM flag is used, any
-socket after the first one does not have a UMEM and should in that
-case not have any FILL or COMPLETION rings created.
+COMPLETION ring are mandatory as you need to have a UMEM tied to your
+socket. But if the XDP_SHARED_UMEM flag is used, any socket after the
+first one does not have a UMEM and should in that case not have any
+FILL or COMPLETION rings created as the ones from the shared umem will
+be used. Note, that the rings are single-producer single-consumer, so
+do not try to access them from multiple processes at the same
+time. See the XDP_SHARED_UMEM section.
+
+In libbpf, you can create Rx-only and Tx-only sockets by supplying
+NULL to the rx and tx arguments, respectively, to the
+xsk_socket__create function.
+
+If you create a Tx-only socket, we recommend that you do not put any
+packets on the fill ring. If you do this, drivers might think you are
+going to receive something when you in fact will not, and this can
+negatively impact performance.

 XDP_UMEM_REG setsockopt
 -----------------------
+4 −4
@@ -770,10 +770,10 @@ Some core changes of the new internal format:
    callq foo
    mov %rax,%r13
    mov %rbx,%rdi
-   mov $0x2,%esi
-   mov $0x3,%edx
-   mov $0x4,%ecx
-   mov $0x5,%r8d
+   mov $0x6,%esi
+   mov $0x7,%edx
+   mov $0x8,%ecx
+   mov $0x9,%r8d
    callq bar
    add %r13,%rax
    mov -0x228(%rbp),%rbx
+369 −133

File changed; preview size limit exceeded, changes collapsed.

+18 −6
@@ -26,10 +26,11 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 #define POKE_MAX_OPCODE_SIZE	5

 struct text_poke_loc {
-	void *detour;
 	void *addr;
-	size_t len;
-	const char opcode[POKE_MAX_OPCODE_SIZE];
+	int len;
+	s32 rel32;
+	u8 opcode;
+	const u8 text[POKE_MAX_OPCODE_SIZE];
 };

 extern void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -51,8 +52,10 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
-extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
 extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
+extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+			       const void *opcode, size_t len, const void *emulate);
 extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
@@ -64,7 +67,16 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
 }

 #define INT3_INSN_SIZE		1
+#define INT3_INSN_OPCODE	0xCC
+
 #define CALL_INSN_SIZE		5
+#define CALL_INSN_OPCODE	0xE8
+
+#define JMP32_INSN_SIZE		5
+#define JMP32_INSN_OPCODE	0xE9
+
+#define JMP8_INSN_SIZE		2
+#define JMP8_INSN_OPCODE	0xEB

 static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
 {
+101 −31
@@ -956,7 +956,6 @@ NOKPROBE_SYMBOL(patch_cmp);
 int poke_int3_handler(struct pt_regs *regs)
 {
 	struct text_poke_loc *tp;
-	unsigned char int3 = 0xcc;
 	void *ip;

 	/*
@@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs)
 		return 0;

 	/*
-	 * Discount the sizeof(int3). See text_poke_bp_batch().
+	 * Discount the INT3. See text_poke_bp_batch().
 	 */
-	ip = (void *) regs->ip - sizeof(int3);
+	ip = (void *) regs->ip - INT3_INSN_SIZE;

 	/*
 	 * Skip the binary search if there is a single member in the vector.
@@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs)
 			return 0;
 	}

-	/* set up the specified breakpoint detour */
-	regs->ip = (unsigned long) tp->detour;
+	ip += tp->len;
+
+	switch (tp->opcode) {
+	case INT3_INSN_OPCODE:
+		/*
+		 * Someone poked an explicit INT3, they'll want to handle it,
+		 * do not consume.
+		 */
+		return 0;
+
+	case CALL_INSN_OPCODE:
+		int3_emulate_call(regs, (long)ip + tp->rel32);
+		break;
+
+	case JMP32_INSN_OPCODE:
+	case JMP8_INSN_OPCODE:
+		int3_emulate_jmp(regs, (long)ip + tp->rel32);
+		break;
+
+	default:
+		BUG();
+	}

 	return 1;
 }
@@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  */
 void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
-	int patched_all_but_first = 0;
-	unsigned char int3 = 0xcc;
+	unsigned char int3 = INT3_INSN_OPCODE;
 	unsigned int i;
+	int do_sync;

 	lockdep_assert_held(&text_mutex);

@@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 	/*
 	 * Second step: update all but the first byte of the patched range.
 	 */
-	for (i = 0; i < nr_entries; i++) {
+	for (do_sync = 0, i = 0; i < nr_entries; i++) {
 		if (tp[i].len - sizeof(int3) > 0) {
 			text_poke((char *)tp[i].addr + sizeof(int3),
-				  (const char *)tp[i].opcode + sizeof(int3),
+				  (const char *)tp[i].text + sizeof(int3),
 				  tp[i].len - sizeof(int3));
-			patched_all_but_first++;
+			do_sync++;
 		}
 	}

-	if (patched_all_but_first) {
+	if (do_sync) {
 		/*
 		 * According to Intel, this core syncing is very likely
 		 * not necessary and we'd be safe even without it. But
@@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 	 * Third step: replace the first byte (int3) by the first byte of
 	 * replacing opcode.
 	 */
-	for (i = 0; i < nr_entries; i++)
-		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
+	for (do_sync = 0, i = 0; i < nr_entries; i++) {
+		if (tp[i].text[0] == INT3_INSN_OPCODE)
+			continue;
+
+		text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+		do_sync++;
+	}

-	on_each_cpu(do_sync_core, NULL, 1);
+	if (do_sync)
+		on_each_cpu(do_sync_core, NULL, 1);
+
 	/*
 	 * sync_core() implies an smp_mb() and orders this store against
 	 * the writing of the new instruction.
@@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 	bp_patching.nr_entries = 0;
 }

+void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+			const void *opcode, size_t len, const void *emulate)
+{
+	struct insn insn;
+
+	if (!opcode)
+		opcode = (void *)tp->text;
+	else
+		memcpy((void *)tp->text, opcode, len);
+
+	if (!emulate)
+		emulate = opcode;
+
+	kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
+	insn_get_length(&insn);
+
+	BUG_ON(!insn_complete(&insn));
+	BUG_ON(len != insn.length);
+
+	tp->addr = addr;
+	tp->len = len;
+	tp->opcode = insn.opcode.bytes[0];
+
+	switch (tp->opcode) {
+	case INT3_INSN_OPCODE:
+		break;
+
+	case CALL_INSN_OPCODE:
+	case JMP32_INSN_OPCODE:
+	case JMP8_INSN_OPCODE:
+		tp->rel32 = insn.immediate.value;
+		break;
+
+	default: /* assume NOP */
+		switch (len) {
+		case 2: /* NOP2 -- emulate as JMP8+0 */
+			BUG_ON(memcmp(emulate, ideal_nops[len], len));
+			tp->opcode = JMP8_INSN_OPCODE;
+			tp->rel32 = 0;
+			break;
+
+		case 5: /* NOP5 -- emulate as JMP32+0 */
+			BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+			tp->opcode = JMP32_INSN_OPCODE;
+			tp->rel32 = 0;
+			break;
+
+		default: /* unknown instruction */
+			BUG();
+		}
+		break;
+	}
+}
+
 /**
  * text_poke_bp() -- update instructions on live kernel on SMP
  * @addr:	address to patch
@@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
  * dynamically allocated memory. This function should be used when it is
  * not possible to allocate memory.
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
 {
-	struct text_poke_loc tp = {
-		.detour = handler,
-		.addr = addr,
-		.len = len,
-	};
-
-	if (len > POKE_MAX_OPCODE_SIZE) {
-		WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
-		return;
-	}
-
-	memcpy((void *)tp.opcode, opcode, len);
+	struct text_poke_loc tp;

+	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }