diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index cd4b6e6f14f9da..6b43845f12343f 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -36,6 +36,8 @@ enum sk_pacing {
 struct sock {
 	struct sock_common	__sk_common;
 #define sk_state		__sk_common.skc_state
+	int			sk_sndbuf;
+	int			sk_wmem_queued;
 	unsigned long		sk_pacing_rate;
 	__u32			sk_pacing_status; /* see enum sk_pacing */
 } __attribute__((preserve_access_index));
@@ -234,7 +236,9 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
 #define MPTCP_SUBFLOWS_MAX	8
 
 struct mptcp_subflow_context {
+	unsigned long avg_pacing_rate;
 	__u32	backup : 1;
+	__u8	stale_count;
 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
 } __attribute__((preserve_access_index));
 
@@ -257,6 +261,8 @@ struct mptcp_sched_ops {
 
 struct mptcp_sock {
 	struct inet_connection_sock	sk;
+	__u64		snd_nxt;
+	int		snd_burst;
 	__u32		token;
 	struct sock	*first;
 	char		ca_name[TCP_CA_NAME_MAX];
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
new file mode 100644
index 00000000000000..b3c8115648667f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include <linux/bpf.h>
+#include <limits.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* cap on the number of bytes handed to one subflow per scheduling round */
+#define MPTCP_SEND_BURST_SIZE	65428
+
+struct subflow_send_info {
+	__u8 subflow_id;
+	__u64 linger_time;
+};
+
+extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
+extern void mptcp_set_timeout(struct sock *sk) __ksym;
+extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
+extern bool tcp_stream_memory_free(const struct sock *sk, int wake) __ksym;
+extern bool bpf_mptcp_subflow_queues_empty(struct sock *sk) __ksym;
+extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk,
+				       struct sock *ssk) __ksym;
+
+#define SSK_MODE_ACTIVE	0
+#define SSK_MODE_BACKUP	1
+#define SSK_MODE_MAX	2
+
+/* the kernel's div_u64() is not available to BPF programs */
+static __always_inline __u64 div_u64(__u64 dividend, __u32 divisor)
+{
+	return dividend / divisor;
+}
+
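+/* Mirrors the kernel's tcp_write_queue_empty(): everything queued so far
+ * has been sent once write_seq catches up with snd_nxt. A NULL return from
+ * the cast means this is not a TCP socket; treat its queue as empty.
+ */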
+static __always_inline bool tcp_write_queue_empty(struct sock *sk)
+{
+	const struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
+
+	return tp ? tp->write_seq == tp->snd_nxt : true;
+}
+
+static __always_inline bool tcp_rtx_and_write_queues_empty(struct sock *sk)
+{
+	return bpf_mptcp_subflow_queues_empty(sk) && tcp_write_queue_empty(sk);
+}
+
+/* mirrors the kernel's __sk_stream_memory_free(), using the fields
+ * mirrored into struct sock in bpf_tcp_helpers.h
+ */
+static __always_inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
+{
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	return tcp_stream_memory_free(sk, wake);
+}
+
+static __always_inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	return __sk_stream_memory_free(sk, 0);
+}
+
+SEC("struct_ops/mptcp_sched_burst_init")
+void BPF_PROG(mptcp_sched_burst_init, struct mptcp_sock *msk)
+{
+}
+
+SEC("struct_ops/mptcp_sched_burst_release")
+void BPF_PROG(mptcp_sched_burst_release, struct mptcp_sock *msk)
+{
+}
+
+static int bpf_burst_get_send(struct mptcp_sock *msk,
+			      struct mptcp_sched_data *data)
+{
+	struct subflow_send_info send_info[SSK_MODE_MAX];
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	__u32 pace, burst, wmem;
+	__u64 linger_time;
+	struct sock *ssk;
+	int i;
+
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < SSK_MODE_MAX; ++i) {
+		send_info[i].subflow_id = MPTCP_SUBFLOWS_MAX;
+		send_info[i].linger_time = -1;
+	}
+
+	for (i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		pace = subflow->avg_pacing_rate;
+		if (!pace) {
+			/* init pacing rate from socket */
+			subflow->avg_pacing_rate = ssk->sk_pacing_rate;
+			pace = subflow->avg_pacing_rate;
+			if (!pace)
+				continue;
+		}
+
+		linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
+		if (linger_time < send_info[subflow->backup].linger_time) {
+			send_info[subflow->backup].subflow_id = i;
+			send_info[subflow->backup].linger_time = linger_time;
+		}
+	}
+	mptcp_set_timeout(sk);
+
+	/* pick the best backup if no other subflow is active */
+	if (send_info[SSK_MODE_ACTIVE].subflow_id == MPTCP_SUBFLOWS_MAX)
+		send_info[SSK_MODE_ACTIVE].subflow_id = send_info[SSK_MODE_BACKUP].subflow_id;
+
+	subflow = bpf_mptcp_subflow_ctx_by_pos(data, send_info[SSK_MODE_ACTIVE].subflow_id);
+	if (!subflow)
+		return -1;
+	ssk = mptcp_subflow_tcp_sock(subflow);
+	if (!ssk || !sk_stream_memory_free(ssk))
+		return -1;
+
+	burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+	wmem = ssk->sk_wmem_queued;
+	if (!burst)
+		goto out;
+
+	/* refresh the pacing-rate average, weighting the old value by the
+	 * bytes already queued and the current rate by the new burst
+	 */
+	subflow->avg_pacing_rate = div_u64((__u64)subflow->avg_pacing_rate * wmem +
+					   ssk->sk_pacing_rate * burst,
+					   burst + wmem);
+	msk->snd_burst = burst;
+
+out:
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
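+/* Reinjection path: prefer the first idle active subflow, then an idle
+ * backup one. Subflows with data still queued at the TCP level are skipped
+ * (and checked for staleness); the backup is only used when every busy
+ * subflow already looks stale, i.e. none of them is making progress.
+ */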
+static int bpf_burst_get_retrans(struct mptcp_sock *msk,
+				 struct mptcp_sched_data *data)
+{
+	int backup = MPTCP_SUBFLOWS_MAX, pick = MPTCP_SUBFLOWS_MAX, subflow_id;
+	struct mptcp_subflow_context *subflow;
+	int min_stale_count = INT_MAX;
+	struct sock *ssk;
+
+	for (int i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		/* still data outstanding at TCP level? skip this */
+		if (!tcp_rtx_and_write_queues_empty(ssk)) {
+			mptcp_pm_subflow_chk_stale(msk, ssk);
+			min_stale_count = min(min_stale_count, subflow->stale_count);
+			continue;
+		}
+
+		if (subflow->backup) {
+			if (backup == MPTCP_SUBFLOWS_MAX)
+				backup = i;
+			continue;
+		}
+
+		if (pick == MPTCP_SUBFLOWS_MAX)
+			pick = i;
+	}
+
+	if (pick < MPTCP_SUBFLOWS_MAX) {
+		subflow_id = pick;
+		goto out;
+	}
+	subflow_id = min_stale_count > 1 ? backup : MPTCP_SUBFLOWS_MAX;
+
+out:
+	subflow = bpf_mptcp_subflow_ctx_by_pos(data, subflow_id);
+	if (!subflow)
+		return -1;
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
+int BPF_STRUCT_OPS(bpf_burst_get_subflow, struct mptcp_sock *msk,
+		   struct mptcp_sched_data *data)
+{
+	if (data->reinject)
+		return bpf_burst_get_retrans(msk, data);
+	return bpf_burst_get_send(msk, data);
+}
+
+SEC(".struct_ops")
+struct mptcp_sched_ops burst = {
+	.init		= (void *)mptcp_sched_burst_init,
+	.release	= (void *)mptcp_sched_burst_release,
+	.get_subflow	= (void *)bpf_burst_get_subflow,
+	.name		= "bpf_burst",
+};