forked from apernet/tcp-brutal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrutal.c
316 lines (264 loc) · 8.73 KB
/
brutal.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#include <linux/module.h>
#include <linux/version.h>
#include <net/tcp.h>
#include <linux/math64.h>
#if IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
#include <net/transp_v6.h>
#else
#warning IPv6 support is disabled. Brutal will only work with IPv4. \
Please ensure you have enabled CONFIG_IPV6 in your kernel config \
and your kernel version is greater than 5.8.
#endif
#define INIT_PACING_RATE 125000 // 1 Mbps
#define INIT_CWND_GAIN 20
#define MIN_PACING_RATE 62500 // 500 Kbps
#define MIN_CWND_GAIN 5
#define MAX_CWND_GAIN 80
#define MIN_CWND 4
#ifndef ICSK_CA_PRIV_SIZE
#error "ICSK_CA_PRIV_SIZE not defined"
#else
// This is the size of the private data area in struct inet_connection_sock
// The size varies between Linux versions
// We use it to calculate the number of slots in the packet info array
#define RAW_PKT_INFO_SLOTS ((ICSK_CA_PRIV_SIZE - 2 * sizeof(u64)) / sizeof(struct brutal_pkt_info))
#define PKT_INFO_SLOTS (RAW_PKT_INFO_SLOTS < 3 ? 3 : (RAW_PKT_INFO_SLOTS > 5 ? 5 : RAW_PKT_INFO_SLOTS))
#endif
#define MIN_PKT_INFO_SAMPLES 50
#define MIN_ACK_RATE_PERCENT 80
#define TCP_BRUTAL_PARAMS 23301
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
static u64 tcp_sock_get_sec(const struct tcp_sock *tp)
{
return div_u64(tp->tcp_mstamp, USEC_PER_SEC);
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0)
// see https://github.com/torvalds/linux/commit/9a568de4818dea9a05af141046bd3e589245ab83
static u64 tcp_sock_get_sec(const struct tcp_sock *tp)
{
return div_u64(tp->tcp_mstamp.stamp_us, USEC_PER_SEC);
}
#else
#include <linux/jiffies.h>
static u64 tcp_sock_get_sec(const struct tcp_sock *tp)
{
return div_u64(jiffies_to_usecs(tcp_time_stamp), USEC_PER_SEC);
}
#endif
struct brutal_pkt_info
{
u64 sec;
u32 acked;
u32 losses;
};
struct brutal
{
u64 rate;
u32 cwnd_gain;
struct brutal_pkt_info slots[PKT_INFO_SLOTS];
};
struct brutal_params
{
u64 rate; // Send rate in bytes per second
u32 cwnd_gain; // CWND gain in tenths (10=1.0)
} __packed;
static struct proto tcp_prot_override __ro_after_init;
#ifdef _TRANSP_V6_H
static struct proto tcpv6_prot_override __ro_after_init;
#endif // _TRANSP_V6_H
#ifdef _LINUX_SOCKPTR_H
static int brutal_set_params(struct sock *sk, sockptr_t optval, unsigned int optlen)
#else
static int brutal_set_params(struct sock *sk, char __user *optval, unsigned int optlen)
#endif
{
struct brutal *brutal = inet_csk_ca(sk);
struct brutal_params params;
if (optlen < sizeof(params))
return -EINVAL;
#ifdef _LINUX_SOCKPTR_H
if (copy_from_sockptr(¶ms, optval, sizeof(params)))
return -EFAULT;
#else
if (copy_from_user(¶ms, optval, sizeof(params)))
return -EFAULT;
#endif
// Sanity checks
if (params.rate < MIN_PACING_RATE)
return -EINVAL;
if (params.cwnd_gain < MIN_CWND_GAIN || params.cwnd_gain > MAX_CWND_GAIN)
return -EINVAL;
brutal->rate = params.rate;
brutal->cwnd_gain = params.cwnd_gain;
return 0;
}
#ifdef _LINUX_SOCKPTR_H
static int brutal_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen)
#else
static int brutal_tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen)
#endif
{
if (level == IPPROTO_TCP && optname == TCP_BRUTAL_PARAMS)
return brutal_set_params(sk, optval, optlen);
else
return tcp_prot.setsockopt(sk, level, optname, optval, optlen);
}
#ifdef _TRANSP_V6_H
#ifdef _LINUX_SOCKPTR_H
static int brutal_tcpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen)
#else // _LINUX_SOCKPTR_H
static int brutal_tcpv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen)
#endif // _LINUX_SOCKPTR_H
{
if (level == IPPROTO_TCP && optname == TCP_BRUTAL_PARAMS)
return brutal_set_params(sk, optval, optlen);
else
return tcpv6_prot.setsockopt(sk, level, optname, optval, optlen);
}
#endif // _TRANSP_V6_H
static void brutal_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct brutal *brutal = inet_csk_ca(sk);
if (sk->sk_family == AF_INET)
sk->sk_prot = &tcp_prot_override;
#ifdef _TRANSP_V6_H
else if (sk->sk_family == AF_INET6)
sk->sk_prot = &tcpv6_prot_override;
#endif // _TRANSP_V6_H
else
BUG(); // WTF?
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
brutal->rate = INIT_PACING_RATE;
brutal->cwnd_gain = INIT_CWND_GAIN;
memset(brutal->slots, 0, sizeof(brutal->slots));
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
// Pacing is REQUIRED for Brutal to work, but Linux only has internal pacing after 4.13.
// For kernels prior to 4.13, you MUST add fq pacing manually (e.g. "tc qdisc add dev eth0 root fq pacing")
// or rate control will be broken.
// See https://github.com/torvalds/linux/commit/218af599fa635b107cfe10acf3249c4dfe5e4123 for details.
cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
#endif
}
// Copied from tcp.h for compatibility reasons
static inline u32 brutal_tcp_snd_cwnd(const struct tcp_sock *tp)
{
return tp->snd_cwnd;
}
// Copied from tcp.h for compatibility reasons
static inline void brutal_tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
{
WARN_ON_ONCE((int)val <= 0);
tp->snd_cwnd = val;
}
static void brutal_update_rate(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct brutal *brutal = inet_csk_ca(sk);
u64 sec = tcp_sock_get_sec(tp);
u64 min_sec = sec - PKT_INFO_SLOTS;
u32 acked = 0, losses = 0;
u32 ack_rate; // Scaled by 100 (100=1.00) as kernel doesn't support float
u64 rate = brutal->rate;
u32 cwnd;
u32 mss = tp->mss_cache;
u32 rtt_ms = (tp->srtt_us >> 3) / USEC_PER_MSEC;
if (!rtt_ms)
rtt_ms = 1;
for (int i = 0; i < PKT_INFO_SLOTS; i++)
{
if (brutal->slots[i].sec >= min_sec)
{
acked += brutal->slots[i].acked;
losses += brutal->slots[i].losses;
}
}
if (acked + losses < MIN_PKT_INFO_SAMPLES)
ack_rate = 100;
else
{
ack_rate = acked * 100 / (acked + losses);
if (ack_rate < MIN_ACK_RATE_PERCENT)
ack_rate = MIN_ACK_RATE_PERCENT;
}
rate *= 100;
rate = div_u64(rate, ack_rate);
// The order here is chosen carefully to avoid overflow as much as possible
cwnd = div_u64(rate, MSEC_PER_SEC);
cwnd *= rtt_ms;
cwnd /= mss;
cwnd *= brutal->cwnd_gain;
cwnd /= 10;
cwnd = max_t(u32, cwnd, MIN_CWND);
brutal_tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));
WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0)
static void brutal_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
#else
static void brutal_main(struct sock *sk, const struct rate_sample *rs)
#endif
{
struct tcp_sock *tp = tcp_sk(sk);
struct brutal *brutal = inet_csk_ca(sk);
u64 sec;
u32 slot;
// Ignore invalid rate samples
if (rs->delivered < 0 || rs->interval_us <= 0)
return;
sec = tcp_sock_get_sec(tp);
div_u64_rem(sec, PKT_INFO_SLOTS, &slot);
if (brutal->slots[slot].sec == sec)
{
// Current slot, update
brutal->slots[slot].acked += rs->acked_sacked;
brutal->slots[slot].losses += rs->losses;
}
else
{
// Uninitialized slot or slot expired
brutal->slots[slot].sec = sec;
brutal->slots[slot].acked = rs->acked_sacked;
brutal->slots[slot].losses = rs->losses;
}
brutal_update_rate(sk);
}
static u32 brutal_undo_cwnd(struct sock *sk)
{
return brutal_tcp_snd_cwnd(tcp_sk(sk));
}
static u32 brutal_ssthresh(struct sock *sk)
{
return tcp_sk(sk)->snd_ssthresh;
}
static struct tcp_congestion_ops tcp_brutal_ops = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "brutal",
.owner = THIS_MODULE,
.init = brutal_init,
.cong_control = brutal_main,
.undo_cwnd = brutal_undo_cwnd,
.ssthresh = brutal_ssthresh,
};
static int __init brutal_register(void)
{
BUILD_BUG_ON(sizeof(struct brutal) > ICSK_CA_PRIV_SIZE);
BUILD_BUG_ON(PKT_INFO_SLOTS < 1);
tcp_prot_override = tcp_prot;
tcp_prot_override.setsockopt = brutal_tcp_setsockopt;
#ifdef _TRANSP_V6_H
tcpv6_prot_override = tcpv6_prot;
tcpv6_prot_override.setsockopt = brutal_tcpv6_setsockopt;
#endif // _TRANSP_V6_H
return tcp_register_congestion_control(&tcp_brutal_ops);
}
static void __exit brutal_unregister(void)
{
tcp_unregister_congestion_control(&tcp_brutal_ops);
}
module_init(brutal_register);
module_exit(brutal_unregister);
MODULE_AUTHOR("Aperture Internet Laboratory");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Brutal");
MODULE_VERSION("1.0.2");