Commit 8d75839b authored by Eric Dumazet's avatar Eric Dumazet Committed by Daniel Borkmann
Browse files

bpf, lpm: make longest_prefix_match() faster

At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that longest_prefix_match()
has a high cost [1].

One reason for that cost is a loop handling one byte at a time.

We can handle more bytes at a time, if enough attention is paid
to endianness.

I was able to remove ~55 % of longest_prefix_match() cpu costs.

[1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf



Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Vlad Dumitrescu <vladum@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parent e4b0c94b
Loading
Loading
Loading
Loading
+49 −10
Original line number Diff line number Diff line
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
				   const struct lpm_trie_node *node,
				   const struct bpf_lpm_trie_key *key)
{
	size_t prefixlen = 0;
	size_t i;
	u32 limit = min(node->prefixlen, key->prefixlen);
	u32 prefixlen = 0, i = 0;

	for (i = 0; i < trie->data_size; i++) {
		size_t b;
	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));

		b = 8 - fls(node->data[i] ^ key->data[i]);
		prefixlen += b;
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)

		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
			return min(node->prefixlen, key->prefixlen);
	/* data_size >= 16 has very small probability.
	 * We do not use a loop for optimal code generation.
	 */
	if (trie->data_size >= 8) {
		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
				       *(__be64 *)key->data);

		prefixlen = 64 - fls64(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i = 8;
	}
#endif

		if (b < 8)
			break;
	while (trie->data_size >= i + 4) {
		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
				       *(__be32 *)&key->data[i]);

		prefixlen += 32 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 4;
	}

	if (trie->data_size >= i + 2) {
		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
				       *(__be16 *)&key->data[i]);

		prefixlen += 16 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 2;
	}

	if (trie->data_size >= i + 1) {
		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

		if (prefixlen >= limit)
			return limit;
	}

	return prefixlen;