-
Notifications
You must be signed in to change notification settings - Fork 54
Expand file tree
/
Copy pathhoma_qdisc.h
More file actions
459 lines (405 loc) · 14.8 KB
/
homa_qdisc.h
File metadata and controls
459 lines (405 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
/* This file contains definitions related to Homa's special-purpose
* queuing discipline
*/
#include "homa_rpc.h"
#ifdef __UNIT_TEST__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif /* __UNIT_TEST__*/
#include <net/sch_generic.h>
#ifdef __UNIT_TEST__
#pragma GCC diagnostic pop
#endif /* __UNIT_TEST__*/
#include <linux/rbtree.h>
#include <net/pkt_sched.h>
#ifndef _HOMA_QDISC_H
#define _HOMA_QDISC_H
/**
* struct homa_qdisc - Contains Homa-specific data for a single instance of
* the homa queuing discipline.
*/
struct homa_qdisc {
/** @qdisc: The Qdisc that this structure is associated with. */
struct Qdisc *qdisc;
/** @qdev: Info shared among all qdiscs for a net_device. */
struct homa_qdisc_dev *qdev;
/**
* @ix: Index of this qdisc's transmit queue among all those for
* its net_device.
*/
int ix;
/**
* @deferred_tcp: List of non-Homa packets for this qdisc that have
* been deferred because of NIC overload, in order of arrival.
* Synchronized with qdev->defer_lock.
*/
struct sk_buff_head deferred_tcp;
/**
* @defer_links: Used to link this object into qdev->deferred_qdiscs
* when deferred_tcp is nonempty. This will be an empty list if
* deferred_tcp is empty. Synchronized with qdev->defer_lock.
*/
struct list_head defer_links;
};
/**
* struct homa_qdisc_dev - Contains information shared across all of the
* homa_qdiscs associated with a net_device.
*/
struct homa_qdisc_dev {
/** @dev: Device common to all qdiscs using this struct. */
struct net_device *dev;
/**
* @hnet: Homa's information about the network namespace
* this object belongs to.
*/
struct homa_net *hnet;
/**
* @refs: Reference count (e.g. includes one reference for each
* homa_qdisc that references this object). Must hold
* hnet->qdisc_devs_lock to access.
* NOTE(review): struct homa_net is not visible in this header, so the
* lock name above cannot be checked here; confirm it is still current.
*/
refcount_t refs;
/** @link_mbps: Speed of the link associated with @dev, in Mbps. */
int link_mbps;
/**
* @cycles_per_mibyte: The number of homa_clock cycles that it takes
* to transmit 2**20 bytes on the link associated with @dev; computed
* from @link_mbps. This is actually a slight overestimate (if we
* underestimate, the link queue could grow without bound during
* periods of high traffic).
*/
int cycles_per_mibyte;
/**
* @links: Used to link this object into the qdevs list in a
* homa_qdisc_shared struct.
*/
struct list_head links;
/**
* @link_idle_time: The time, measured by homa_clock, at which we
* estimate that all of the packets passed to @dev will have been
* transmitted, assuming the NIC can transmit at full link speed.
* May be in the past. See the PACING comment at the top of
* homa_qdisc.c for a discussion of the pacing mechanism.
*/
atomic64_t link_idle_time __aligned(L1_CACHE_BYTES);
/**
* @deferred_rpcs: Contains all homa_rpc's with deferred packets, in
* SRPT order.
*/
struct rb_root_cached deferred_rpcs;
/**
* @oldest_rpc: The RPC in deferred_rpcs with the oldest init_time, or
* NULL if not currently known.
*/
struct homa_rpc *oldest_rpc;
/**
* @srpt_bytes: The number of bytes that should be transmitted from
* SRPT packets before transmitting a FIFO packet. <= 0 means
* the next packet transmission should be FIFO.
*/
s64 srpt_bytes;
/**
* @deferred_qdiscs: List of all homa_qdiscs with non-Homa packets
* that have been deferred because of NIC overload.
*/
struct list_head deferred_qdiscs;
/**
* @next_qdisc: Points to either the defer_links field in a homa_qdisc
* or to deferred_qdiscs above. Used to select the next non-Homa packet
* for transmission. Note: this may refer to deferred_qdiscs even when
* deferred_qdiscs is nonempty.
*/
struct list_head *next_qdisc;
/**
* @last_defer: The most recent homa_clock() time when a packet was
* deferred, or 0 if there are currently no deferred packets.
*/
u64 last_defer;
/**
* @max_nic_queue_bytes: The number of bytes corresponding to
* max_nic_queue_usecs in struct homa_qdisc_shared.
*/
int max_nic_queue_bytes;
/**
* @congested_qdisc: If non-NULL, this variable identifies a qdisc
* whose NIC queue is overloaded according to @max_nic_queue_bytes.
* NULL means no queue is currently known to be congested. This
* variable is accessed without synchronization. See the PACING comment
* at the top of homa_qdisc.c for a discussion of the packet pacing
* architecture.
*/
struct homa_qdisc *congested_qdisc;
/**
* @defer_lock: Synchronizes access to information about deferred
* packets, including deferred_rpcs, deferred_qdiscs, next_qdisc,
* last_defer, and some information in homa_qdiscs.
*/
spinlock_t defer_lock;
/**
* @homa_credit: When there are both Homa and TCP deferred packets,
* this is used to balance output between them according to the
* homa_share sysctl value. Positive means that Homa packets should
* be transmitted next, zero or negative means TCP. When a TCP
* packet is transmitted, this is incremented by the packet length
* times homa_share; when a Homa packet is transmitted, it is
* decremented by packet length times (100 - homa_share). Used only
* by the pacer, so no need for synchronization.
*/
int homa_credit;
/**
* @pacer_kthread: Kernel thread that eventually transmits packets
* on homa_deferred and tcp_deferred.
* NOTE(review): no fields named homa_deferred or tcp_deferred appear
* in this header; presumably this means the deferred packets tracked
* by @deferred_rpcs and @deferred_qdiscs -- confirm.
*/
struct task_struct *pacer_kthread;
/**
* @pacer_sleep: Used to block the pacer thread when there
* are no throttled RPCs.
*/
struct wait_queue_head pacer_sleep;
/**
* @pacer_mutex: Ensures that only one instance of
* homa_qdisc_pacer runs at a time. Only used in "try" mode:
* never block on this. Despite its name, this is a spinlock.
*/
spinlock_t pacer_mutex ____cacheline_aligned_in_smp;
/**
* @rcu_head: Holds state of a pending call_rcu invocation when
* this struct is deleted.
*/
struct rcu_head rcu_head;
};
/**
* struct homa_qdisc_shared - There is one of these structs for each
* struct homa. Contains information that is shared across all homa_qdiscs
* and homa_qdisc_devs for the struct homa.
*/
struct homa_qdisc_shared {
/**
* @mutex: Must hold when modifying qdevs. Can scan qdevs
* without locking using RCU.
*/
struct mutex mutex;
/**
* @qdevs: RCU list of all homa_qdisc_devs that currently
* exist for this struct homa.
*/
struct list_head qdevs;
/**
* @fifo_fraction: Out of every 1000 packets transmitted by the
* pacer, this number will be transmitted from the oldest message
* rather than the highest-priority message. Set externally via
* sysctl.
*/
int fifo_fraction;
/**
* @fifo_weight: Determines how much qdev->fifo_count is updated
* when a FIFO packet is transmitted (for each FIFO byte transmitted,
* @fifo_weight >> HOMA_FIFO_WEIGHT_SHIFT SRPT bytes should be
* transmitted); computed from @fifo_fraction. Valid only if
* fifo_fraction is nonzero.
* NOTE(review): struct homa_qdisc_dev has no fifo_count field in this
* header; this presumably refers to qdev->srpt_bytes -- confirm.
*/
int fifo_weight;
#define HOMA_FIFO_WEIGHT_SHIFT 10
/**
* @max_nic_est_backlog_usecs: Limits the NIC queue length: we won't
* queue packets in the NIC for transmission if link_idle_time is
* this many microseconds in the future (or more). Set externally via
* sysctl.
*/
int max_nic_est_backlog_usecs;
/**
* @max_nic_est_backlog_cycles: Same as max_nic_est_backlog_usecs
* except in homa_clock() units.
*/
int max_nic_est_backlog_cycles;
/**
* @max_nic_queue_usecs: An additional limit on NIC queue buildup:
* if any individual NIC queue reaches a length where it would
* take at least this many microseconds to transmit all of its packets,
* then no more packets will be queued for *any* NIC queue until
* the queue gets below this limit. Set externally via sysctl.
*/
int max_nic_queue_usecs;
/**
* @defer_min_bytes: If a packet has fewer bytes than this, then it
* will be transmitted immediately, regardless of NIC queue length.
* We have this limit because for very small packets CPU overheads
* make it impossible to keep up with the NIC so (a) the NIC queue
* can't grow and (b) using the pacer would serialize all of these
* packets through a single core, which makes things even worse.
* Set externally via sysctl.
*/
int defer_min_bytes;
/**
* @homa_share: When the uplink is overloaded, this determines how
* to share bandwidth between TCP and Homa. It gives the percentage
* of bandwidth that Homa will receive; TCP (and all other protocols,
* such as UDP) get the remainder. Must be between 0 and 100,
* inclusive.
*/
int homa_share;
/**
* @max_link_usage: An integer <= 100 indicating the maximum percentage
* of uplink bandwidth that Homa will attempt to utilize. A smaller
* value reduces the likelihood of queue buildup in the NIC, but
* also prevents full link utilization.
*/
int max_link_usage;
#ifndef __STRIP__ /* See strip.py */
/**
* @sysctl_header: Used to remove sysctl values when this structure
* is destroyed.
*/
struct ctl_table_header *sysctl_header;
#endif /* See strip.py */
};
/**
* struct homa_rcu_kfreer - Used by homa_rcu_kfree to defer kfree-ing
* an object until it is RCU-safe.
*/
struct homa_rcu_kfreer {
/** @rcu_head: Holds state of a pending call_rcu invocation. */
struct rcu_head rcu_head;
/** @object: The object to kfree once RCU has synced. */
void *object;
};
/* Prototypes for functions whose definitions are not in this header
 * (presumably homa_qdisc.c, which the comments above refer to).
 */
void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev);
bool homa_qdisc_can_bypass(struct sk_buff *skb,
struct homa_qdisc *q);
void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev,
struct sk_buff *skb);
void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb);
void homa_qdisc_destroy(struct Qdisc *sch);
void homa_qdisc_dev_callback(struct rcu_head *head);
int homa_qdisc_dointvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free);
void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev);
struct homa_rpc *
homa_qdisc_get_oldest(struct homa_qdisc_dev *qdev);
int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack);
void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
struct homa_rpc *rpc);
int homa_qdisc_pacer(struct homa_qdisc_dev *qdev);
void homa_qdisc_pacer_check(struct homa *homa);
int homa_qdisc_pacer_main(void *device);
struct homa_qdisc_dev *
homa_qdisc_qdev_get(struct net_device *dev);
void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev);
int homa_qdisc_register(void);
struct homa_qdisc_shared *
homa_qdisc_shared_alloc(void);
void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared);
void homa_qdisc_unregister(void);
int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev,
int bytes, int max_queue_ns);
void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared);
int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev);
int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev);
void homa_rcu_kfree(void *object);
void homa_rcu_kfree_callback(struct rcu_head *head);
/**
 * homa_qdisc_active() - Check whether homa qdiscs are enabled for a Homa
 * transport (in which case the old pacer should not be used).
 * @homa:     Information about the Homa transport.
 * Return:    True if homa->qshared->qdevs contains at least one
 *            homa_qdisc_dev, false if the list is empty.
 */
static inline bool homa_qdisc_active(struct homa *homa)
{
	struct homa_qdisc_dev *first;

	first = list_first_or_null_rcu(&homa->qshared->qdevs,
				       struct homa_qdisc_dev, links);
	return first != NULL;
}
/**
 * homa_qdisc_rpc_init() - Set up the qdisc-related state of an RPC:
 * tx_left starts at HOMA_MAX_MESSAGE_LENGTH and the packet queue starts
 * out empty.
 * @qrpc:     Struct to initialize.
 */
static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc)
{
	qrpc->tx_left = HOMA_MAX_MESSAGE_LENGTH;
	skb_queue_head_init(&qrpc->packets);
}
/**
 * homa_qdisc_any_deferred() - Check whether a homa_qdisc_dev currently
 * has any deferred packets.
 * @qdev:     Holds info about deferred packets.
 * Return:    True if @qdev->deferred_rpcs has at least one entry or
 *            @qdev->deferred_qdiscs is nonempty, false otherwise.
 */
static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev)
{
	if (rb_first_cached(&qdev->deferred_rpcs))
		return true;
	return !list_empty(&qdev->deferred_qdiscs);
}
/**
* homa_qdisc_schedule_skb() - Enqueue an skb on a qdisc and schedule the
* qdisc for execution.
* @skb: Packet buffer to queue for output
* @qdisc: homa_qdisc on which to schedule it.
*/
static inline void homa_qdisc_schedule_skb(struct sk_buff *skb,
struct Qdisc *qdisc) {
spin_lock_bh(qdisc_lock(qdisc));
qdisc_enqueue_tail(skb, qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
__netif_schedule(qdisc);
}
/**
 * homa_qdisc_precedes() - Ordering function for qdev->deferred_rpcs:
 * decides whether @rpc1 sorts before @rpc2 (i.e. has higher priority).
 * Ties are not allowed, so the result is a strict ordering.
 * @rpc1:     RPC to compare.
 * @rpc2:     RPC to compare; must be different from @rpc1.
 * Return:    True if @rpc1 is "less" than @rpc2, false if it is "greater".
 */
static inline bool homa_qdisc_precedes(struct homa_rpc *rpc1,
				       struct homa_rpc *rpc2)
{
	/* Primary key: bytes left to transmit (fewer bytes is "less"). */
	if (rpc1->qrpc.tx_left != rpc2->qrpc.tx_left)
		return rpc1->qrpc.tx_left < rpc2->qrpc.tx_left;

	/* Secondary key: RPC age (the older RPC is "less"). */
	if (rpc1->msgout.init_time != rpc2->msgout.init_time)
		return rpc1->msgout.init_time < rpc2->msgout.init_time;

	/* Still tied (highly unlikely): fall back to comparing the
	 * addresses of the RPCs so that the result is never a tie.
	 */
	return rpc1 < rpc2;
}
/**
* homa_qdisc_bytes_pending() - Return the total number of bytes in skbs
* that have been enqueued in the NIC for transmission via a given queue
* but have not yet been returned after transmission.
* @q: Return the pending bytes for the devqueue associated with
* this qdisc.
* Return: See above
*/
static inline int homa_qdisc_bytes_pending(struct homa_qdisc *q)
{
/* Ideally this function would be provided by dynamic_queue_limits.h
* so that we don't have to root around in its data structures.
*/
struct dql *dql = &qdisc_from_priv(q)->dev_queue->dql;
return READ_ONCE(dql->num_queued) - READ_ONCE(dql->num_completed);
}
/**
* homa_qdisc_update_congested() - If the NIC queue for a qdisc has
* become too long, record the fact that this qdisc is congested.
* @q: qdisc whose netdev_queue should be checked.
*/
static inline void homa_qdisc_update_congested(struct homa_qdisc *q)
{
if (homa_qdisc_bytes_pending(q) > q->qdev->max_nic_queue_bytes) {
if (!READ_ONCE(q->qdev->congested_qdisc))
tt_record2("homa_qdisc_update_congested marked qid %d congested (%d bytes)",
q->ix, homa_qdisc_bytes_pending(q));
WRITE_ONCE(q->qdev->congested_qdisc, q);
}
}
#endif /* _HOMA_QDISC_H */