Commit 0d7f6827 authored by Alexei Starovoitov
Browse files

Merge branch 'bpf_skb_ecn_set_ce'



Lawrence Brakmo says:

====================
Host Bandwidth Manager is a framework for limiting the bandwidth used
by v2 cgroups. It consists of 1 BPF helper, a sample BPF program to
limit egress bandwidth as well as a sample user program and script to
simplify HBM testing.

The sample HBM BPF program is not meant to be production quality; it is
provided as a proof of concept. A lot more information, including sample
runs in some cases, is provided in the commit messages of the individual
patches.

A future patch will add support for reducing TCP's cwnd (we are evaluating
alternatives). Another patch will add support for fair queueing's Earliest
Departure Time. Until then, HBM is better suited for flows supporting ECN.

In addition, a BPF program to limit ingress bandwidth will be provided in
an upcoming patchset.

Changes from v1 to v2:
  * bpf_tcp_enter_cwr can only be called from a cgroup skb egress BPF
    program (otherwise load or attach will fail) where we already hold
    the sk lock. Also only applies for ESTABLISHED state.
  * bpf_skb_ecn_set_ce uses INET_ECN_set_ce()
  * bpf_tcp_check_probe_timer now uses tcp_reset_xmit_timer. Can only be
    used by egress cgroup skb programs.
  * removed load_cg_skb user program.
  * nrm bpf egress program checks packet header in skb to determine
    ECN value. Now also works for ECN enabled UDP packets.
    Using ECN_ defines instead of integers.
  * NRM script test program now uses bpftool instead of load_cg_skb

Changes from v2 to v3:
  * Changed name from NRM (Network Resource Manager) to HBM (Host
    Bandwidth Manager)
  * The bpf helper to set ECN ce now checks that the header is writeable
  * Removed helper bpf functions that modified TCP state due to a concern
    about whether the socket is locked by the current thread.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents b74e21ab 4ffd44cf
Loading
Loading
Loading
Loading
+9 −1
Original line number Diff line number Diff line
@@ -2359,6 +2359,13 @@ union bpf_attr {
 *	Return
 *		A **struct bpf_tcp_sock** pointer on success, or NULL in
 *		case of failure.
 *
 * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
 *     Description
 *             Sets ECN of IP header to ce (congestion encountered) if
 *             current value is ect (ECN capable). Works with IPv6 and IPv4.
 *     Return
 *             1 if set, 0 if not set.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -2457,7 +2464,8 @@ union bpf_attr {
	FN(spin_lock),			\
	FN(spin_unlock),		\
	FN(sk_fullsock),		\
	FN(tcp_sock),
	FN(tcp_sock),			\
	FN(skb_ecn_set_ce),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
+28 −0
Original line number Diff line number Diff line
@@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
};

/* Mark the IPv4/IPv6 header of @skb as CE (congestion encountered).
 *
 * Returns 0 without touching the packet when the protocol is neither
 * IPv4 nor IPv6, when the linear part of the skb is shorter than the
 * fixed IP header, or when the skb is cloned and that header is not
 * writable. Otherwise returns whatever INET_ECN_set_ce() returns.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
	unsigned int hdr_size;

	/* Size of the fixed IP header for the protocol carried by the skb */
	if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
		hdr_size = sizeof(struct ipv6hdr);
	else if (skb->protocol == cpu_to_be16(ETH_P_IP))
		hdr_size = sizeof(struct iphdr);
	else
		return 0;

	/* The header must sit in the linear data area and be safe to modify */
	if (skb_headlen(skb) < hdr_size ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, hdr_size)))
		return 0;

	return INET_ECN_set_ce(skb);
}

/* Prototype describing bpf_skb_ecn_set_ce(): a single context-pointer
 * argument and an integer return value; gpl_only is left false.
 */
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
	.func		= bpf_skb_ecn_set_ce,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};
#endif /* CONFIG_INET */

bool bpf_helper_changes_pkt_data(void *func)
@@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
	case BPF_FUNC_skb_ecn_set_ce:
		return &bpf_skb_ecn_set_ce_proto;
#endif
	default:
		return sk_filter_func_proto(func_id, prog);
+5 −0
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ hostprogs-y += xdpsock
hostprogs-y += xdp_fwd
hostprogs-y += task_fd_query
hostprogs-y += xdp_sample_pkts
hostprogs-y += hbm

# Libbpf dependencies
LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdpsock-objs := xdpsock_user.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)

# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -164,6 +166,7 @@ always += xdp_adjust_tail_kern.o
always += xdp_fwd_kern.o
always += task_fd_query_kern.o
always += xdp_sample_pkts_kern.o
always += hbm_out_kern.o

KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -263,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
$(src)/*.c: verify_target_bpf $(LIBBPF)

$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
$(obj)/hbm.o: $(src)/hbm.h

# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
+436 −0
Original line number Diff line number Diff line
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (c) 2019 Facebook
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.

# Print the help text and exit. Also called by processArgs when it meets an
# argument it does not recognise.
Usage() {
  cat <<EOF
Script for testing HBM (Host Bandwidth Manager) framework.
It creates a cgroup to use for testing and loads a BPF program to limit
egress or ingress bandwidth. It then uses iperf3 or netperf to create
loads. The output is the goodput in Mbps (unless -D was used).

USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]
             [-d=<delay>|--delay=<delay>] [--debug] [-E]
             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]
             [-l] [-N] [-p=<port>|--port=<port>] [-P]
             [-q=<qdisc>] [-r=<rate>|--rate=<rate>] [-R]
             [-s=<server>|--server=<server>] [-S|--stats]
             [-t=<time>|--time=<time>] [-w] [cubic|dctcp]
  Where:
    out               egress (default)
    -b or --bpf       BPF program filename to load and attach.
                      Default is hbm_out_kern.o for egress.
    -c or --cc        TCP congestion control (cubic or dctcp)
    --debug           print BPF trace buffer
    -d or --delay     add a delay in ms using netem
    -D                In addition to the goodput in Mbps, it also outputs
                      other detailed information. This information is
                      test dependent (i.e. iperf3 or netperf).
    -E                enable ECN (not required for dctcp)
    -f or --flows     number of concurrent flows (default=1)
    -i or --id        cgroup id (an integer, default is 1)
    -N                use netperf instead of iperf3
    -l                do not limit flows using loopback
    -h                Help
    -p or --port      iperf3 port (default is 5201)
    -P                use an iperf3 instance for each flow
    -q                use the specified qdisc
    -r or --rate      rate in Mbps (default is 1Gbps)
    -R                Use TCP_RR for netperf. 1st flow has req
                      size of 10KB, rest of 1MB. Reply in all
                      cases is 1 byte.
                      More detailed output for each flow can be found
                      in the files netperf.<cg>.<flow>, where <cg> is the
                      cgroup id as specified with the -i flag, and <flow>
                      is the flow id starting at 1 and increasing by 1 for
                      each flow (as specified by -f).
    -s or --server    hostname of netperf server. Used to create netperf
                      test traffic between two hosts (default is within host)
                      netserver must be running on the host.
    -S or --stats     whether to update hbm stats (default is no).
    -t or --time      duration of iperf3 in seconds (default=5)
    -w                Work conserving flag. cgroup can increase its
                      bandwidth beyond the rate limit specified
                      while there is available bandwidth. Current
                      implementation assumes there is only one NIC
                      (eth0), but can be extended to support multiple
                      NICs.
    cubic or dctcp    specify which TCP CC to use

EOF
  exit
}

#set -x

# --- Invocation context ---------------------------------------------------
name="$0"
args="$@"

# --- Options forwarded to the hbm loader -----------------------------------
dir="-o"            # direction flag passed to hbm
dir_name="out"      # suffix of the hbm.<id>.<dir_name> stats file
id=1                # cgroup id
rate=1000           # rate limit in Mbps
dur=5               # test duration in seconds
prog=""             # BPF object file given with -b/--bpf
flags=""            # extra hbm flags accumulated by processArgs
debug_flag=0
do_stats=0

# --- Traffic generation ----------------------------------------------------
flows=1
flow_cnt=1
port=5201
multi_iperf=0
use_netperf=0
rr=0
server=""

# --- Network tuning --------------------------------------------------------
cc=x                # "x" means: leave the congestion control untouched
ecn=0
netem=0             # netem delay in ms, 0 disables netem
qdisc=""

# --- Reporting ---------------------------------------------------------------
details=0

# Launch ./hbm in the background using the options collected in the global
# settings. The command line is recorded at the top of hbm.out, followed by
# everything hbm writes to stdout/stderr. Prints the PID of the background
# job on stdout so the caller can capture it.
function start_hbm () {
  local hbm_cmd="./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog"

  rm -f hbm.out
  {
    echo "$hbm_cmd"
    echo " "
  } > hbm.out
  # Left unquoted on purpose: the string is split back into words exactly
  # as the individual variable expansions would have been.
  $hbm_cmd >> hbm.out 2>&1  &
  echo $!
}

# Parse the command line words saved in $args and update the global
# settings accordingly. Options that take a value use the -x=<value> /
# --long=<value> form; the value is everything after the first '='.
# -h/--help prints the usage text; any unrecognised word prints
# "Unknown arg" followed by the usage text. Both paths exit via Usage.
processArgs () {
  for i in $args ; do
    case $i in
    # Support for upcoming ingress rate limiting
    #in)
    #  dir="-i"
    #  dir_name="in"
    #  ;;
    out)
      dir="-o"
      dir_name="out"
      ;;
    -b=*|--bpf=*)
      prog="${i#*=}"
      ;;
    -c=*|--cc=*)
      cc="${i#*=}"
      ;;
    --debug)
      flags="$flags -d"
      debug_flag=1
      ;;
    -d=*|--delay=*)
      netem="${i#*=}"
      ;;
    -D)
      details=1
      ;;
    -E)
      ecn=1
      ;;
    # Support for upcoming fq Early Departure Time egress rate limiting
    #--edt)
    # prog="hbm_out_edt_kern.o"
    # qdisc="fq"
    # ;;
    -f=*|--flows=*)
      flows="${i#*=}"
      ;;
    -h|--help)
      # Documented in the usage text; without this case it was reported
      # as an unknown argument.
      Usage
      ;;
    -i=*|--id=*)
      id="${i#*=}"
      ;;
    -l)
      flags="$flags -l"
      ;;
    -N)
      use_netperf=1
      ;;
    -p=*|--port=*)
      port="${i#*=}"
      ;;
    -P)
      multi_iperf=1
      ;;
    -q=*)
      qdisc="${i#*=}"
      ;;
    -r=*|--rate=*)
      rate="${i#*=}"
      ;;
    -R)
      rr=1
      ;;
    -s=*|--server=*)
      server="${i#*=}"
      ;;
    -S|--stats)
      flags="$flags -s"
      do_stats=1
      ;;
    -t=*|--time=*)
      dur="${i#*=}"
      ;;
    -w)
      flags="$flags -w"
      ;;
    cubic)
      cc=cubic
      ;;
    dctcp)
      cc=dctcp
      ;;
    *)
      echo "Unknown arg:$i"
      Usage
      ;;
    esac
  done
}

processArgs

if [ $debug_flag -eq 1 ] ; then
  rm -f hbm_out.log
fi

# Start the hbm loader in the background, then pause briefly before using
# the cgroup directory below (presumably so hbm has time to set it up).
# "sleep 0.1" replaces the non-portable usleep helper.
hbm_pid=$(start_hbm)
sleep 0.1

host=$(hostname)
cg_base_dir=/sys/fs/cgroup
cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"

# Move this shell into the test cgroup
echo $$ >> $cg_dir/cgroup.procs

ulimit -l unlimited

rm -f ss.out
rm -f hbm.[0-9]*.$dir_name
if [ $ecn -ne 0 ] ; then
  # Remember the ECN setting in effect before the test and put it back
  # when the script exits, so that running with -E does not permanently
  # change the host configuration.
  cur_ecn=$(sysctl -n net.ipv4.tcp_ecn)
  if [ -n "$cur_ecn" ] ; then
    trap 'sysctl -w -q -n net.ipv4.tcp_ecn=$cur_ecn' EXIT
  fi
  sysctl -w -q -n net.ipv4.tcp_ecn=1
fi

if [ $use_netperf -eq 0 ] ; then
  # Saved so the original congestion control can be restored after the test
  cur_cc=$(sysctl -n net.ipv4.tcp_congestion_control)
  if [ "$cc" != "x" ] ; then
    sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
  fi
fi

# Install either a netem delay (-d) or the requested qdisc (-q) on lo;
# -d takes precedence.
if [ "$netem" -ne "0" ] ; then
  if [ "$qdisc" != "" ] ; then
    echo "WARNING: Ignoring -q options because -d option used"
  fi
  tc qdisc del dev lo root > /dev/null 2>&1
  tc qdisc add dev lo root netem delay ${netem}ms > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
  tc qdisc del dev lo root > /dev/null 2>&1
  tc qdisc add dev lo root $qdisc > /dev/null 2>&1
fi

n=0
# One ping every 0.2s for the duration of the test
m=$((dur * 5))
hn="::1"
if [ $use_netperf -ne 0 ] ; then
  if [ "$server" != "" ] ; then
    hn=$server
  fi
fi

( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &

# Print the per-cgroup header and, when -S was given and the file
# hbm.<id>.<dir_name> exists, the contents of that file.
print_hbm_details () {
  echo ""
  echo "Details for HBM in cgroup $id"
  if [ $do_stats -eq 1 ] && [ -e hbm.$id.$dir_name ] ; then
    cat hbm.$id.$dir_name
  fi
}

# Print the average delay extracted from ping.out and the aggregate rate.
print_delay_and_goodput () {
  delay=$(grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$")
  echo "PING AVG DELAY:$delay"
  echo "AGGREGATE_GOODPUT:$rate"
}

if [ $use_netperf -ne 0 ] ; then
  # Start a local netserver only when none is running and no remote
  # server was requested.
  begNetserverPid=$(ps ax | grep netserver | grep --invert-match "grep" | \
                    awk '{ print $1 }')
  if [ "$begNetserverPid" == "" ] ; then
    if [ "$server" == "" ] ; then
      ( ./netserver > /dev/null 2>&1) &
      sleep 0.1
    fi
  fi
  flow_cnt=1
  if [ "$server" == "" ] ; then
    np_server=$host
  else
    np_server=$server
  fi
  if [ "$cc" == "x" ] ; then
    np_cc=""
  else
    np_cc="-K $cc,$cc"
  fi
  replySize=1
  while [ $flow_cnt -le $flows ] ; do
    if [ $rr -ne 0 ] ; then
      # First flow uses a 10K request, all others 1M
      reqSize=1M
      if [ $flow_cnt -eq 1 ] ; then
        reqSize=10K
      fi
      if [ "$dir" == "-i" ] ; then
        replySize=$reqSize
        reqSize=1
      fi
      ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR  -- -r $reqSize,$replySize $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
    else
      if [ "$dir" == "-i" ] ; then
        ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
      else
        ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
      fi
    fi
    flow_cnt=$((flow_cnt + 1))
  done

  # sleep for duration of test (plus some buffer)
  n=$((dur + 2))
  sleep $n

  # force graceful termination of netperf
  pids=$(pgrep netperf)
  for p in $pids ; do
    kill -SIGALRM $p
  done

  flow_cnt=1
  rate=0
  if [ $details -ne 0 ] ; then
    print_hbm_details
  fi
  # Sum the per-flow throughput reported in each netperf output file
  while [ $flow_cnt -le $flows ] ; do
    if [ "$dir" == "-i" ] ; then
      r=$(grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" netperf.$id.$flow_cnt | grep -o "[0-9]*")
    else
      r=$(grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" netperf.$id.$flow_cnt | grep -o "[0-9]*")
    fi
    echo "rate for flow $flow_cnt: $r"
    rate=$((rate + r))
    if [ $details -ne 0 ] ; then
      echo "-----"
      echo "Details for cgroup $id, flow $flow_cnt"
      cat netperf.$id.$flow_cnt
    fi
    flow_cnt=$((flow_cnt + 1))
  done
  if [ $details -ne 0 ] ; then
    echo ""
    print_delay_and_goodput
  else
    echo $rate
  fi
elif [ $multi_iperf -eq 0 ] ; then
  # Single iperf3 client driving all flows in parallel
  (iperf3 -s -p $port -1 > /dev/null 2>&1) &
  sleep 0.1
  iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
  rates=$(grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*")
  # The last receiver line is the one kept as the reported rate
  rate=$(echo $rates | grep -o "[0-9]*$")

  if [ $details -ne 0 ] ; then
    print_hbm_details
    print_delay_and_goodput
  else
    echo $rate
  fi
else
  # One iperf3 server/client pair per flow, each on its own port
  flow_cnt=1
  while [ $flow_cnt -le $flows ] ; do
    (iperf3 -s -p $port -1 > /dev/null 2>&1) &
    ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
    port=$((port + 1))
    flow_cnt=$((flow_cnt + 1))
  done
  n=$((dur + 1))
  sleep $n
  flow_cnt=1
  rate=0
  if [ $details -ne 0 ] ; then
    print_hbm_details
  fi

  while [ $flow_cnt -le $flows ] ; do
    r=$(cat iperf3.$id.$flow_cnt)
    if [ $details -ne 0 ] ; then
      echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
    fi
    rate=$((rate + r))
    flow_cnt=$((flow_cnt + 1))
  done
  if [ $details -ne 0 ] ; then
    print_delay_and_goodput
  else
    echo $rate
  fi
fi

# Put back the congestion control saved before the test
if [ $use_netperf -eq 0 ] ; then
  sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
fi
if [ $ecn -ne 0 ] ; then
  sysctl -w -q -n net.ipv4.tcp_ecn=0
fi
# Remove the root qdisc the test installed on lo. This covers both the
# netem delay (-d) and a qdisc selected with -q, which used to be left
# behind after the test.
if [ "$netem" -ne "0" ] || [ "$qdisc" != "" ] ; then
  tc qdisc del dev lo root > /dev/null 2>&1
fi

sleep 2

# Kill hbm only if the single running "hbm " process is the one we started
hbmPid=$(ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }')
if [ "$hbmPid" == "$hbm_pid" ] ; then
  kill $hbm_pid
fi

sleep 1

# Detach any BPF programs that may have lingered.
# The "bpftool cgroup tree" lines mentioning hbm are walked word by word:
# a word starting with the test cgroup prefix selects the cgroup, and the
# next two words are then taken as program id and attach type.
#   state 2: no cgroup seen yet, 0: expecting id, 1: expecting attach type
cg_prefix="/sys/fs/cgroup/cgroup-test-work-dir/"
state=2
for word in $(bpftool cgroup tree | grep hbm) ; do
  if [ "${word:0:36}" == "$cg_prefix" ] ; then
    cg_path=$word
    state=0
  elif [ $state -eq 0 ] ; then
    prog_id=$word
    state=1
  elif [ $state -eq 1 ] ; then
    attach_type=$word
    bpftool cgroup detach $cg_path $attach_type id $prog_id
    state=0
  fi
done

# Stop the local netserver only if this script was the one that started it
if [ $use_netperf -ne 0 ] ; then
  if [ "$server" == "" ] ; then
    if [ "$begNetserverPid" == "" ] ; then
      netserverPid=$(ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }')
      if [ "$netserverPid" != "" ] ; then
        kill $netserverPid
      fi
    fi
  fi
fi
exit

samples/bpf/hbm.c

0 → 100644
+441 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading