Gautam Ramakrishnan <gautamr...@gmail.com> writes: > Hello, thanks for the feedback > > On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <t...@redhat.com> wrote: >> >> Some suggestions below to make fq_pie and fq_codel more similar (ref. my >> previous email). >> >> Also, a few unrelated nits. >> >> > From: Mohit P. Tahiliani <tahili...@nitk.edu.in> >> > >> > FQ-PIE incorporates fair/flow queuing in which every queue >> > is managed by an instance of PIE queueing discipline. >> > The algorithm provides good control over the queueing delay >> > while at the same time, ensures fairness across various >> > flows sharing the same link. >> > >> > Principles: >> > - Packets are classified stochastically on flows. >> > - Each flow has a PIE managed queue. >> > - Flows are linked onto two (Round Robin) lists, >> > so that new flows have priority on old ones. >> > - For a given flow, packets are dropped on arrival according >> > to a drop probability. >> > - Tail drops only. >> >> Why tail drops only? > > I had mentioned this because packets are dropped only at enqueue.
Yup, realise that; was just wondering why you went with this design instead of doing the head drop that fq_codel does? >> >> > Usage: >> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ] >> > [ alpha NUMBER ] [ beta NUMBER ] >> > [ target TIME us ] [ tupdate TIME us ] >> > [ bytemode ] [ quantum BYTES ] >> > [ ecn | ecn_prob PERCENTAGE ] >> > >> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU >> > target: 15ms >> > tupdate: 15ms >> > alpha: 2 (on a scale of 0 to 16) >> > beta: 20 (on a scale of 0 to 16) >> > ecn: false >> > ecn_prob: 10% >> > >> > Signed-off-by: Mohit P. Tahiliani <tahili...@nitk.edu.in> >> > Signed-off-by: Sachin D. Patil <sdp.sac...@gmail.com> >> > Signed-off-by: Mohit Bhasi <mohitbhasi1...@gmail.com> >> > Signed-off-by: V. Saicharan <vsaicharan1...@gmail.com> >> > Signed-off-by: Leslie Monis <lesliemo...@gmail.com> >> > Signed-off-by: Gautam Ramakrishnan <gautamr...@gmail.com> >> > Cc: Dave Taht <dave.t...@gmail.com> >> > --- >> > include/uapi/linux/pkt_sched.h | 28 ++ >> > net/sched/Kconfig | 14 +- >> > net/sched/Makefile | 1 + >> > net/sched/sch_fq_pie.c | 485 +++++++++++++++++++++++++++++++++ >> > 4 files changed, 527 insertions(+), 1 deletion(-) >> > create mode 100644 net/sched/sch_fq_pie.c >> > >> > diff --git a/include/uapi/linux/pkt_sched.h >> > b/include/uapi/linux/pkt_sched.h >> > index 7ee74c3474bf..005413bd09ee 100644 >> > --- a/include/uapi/linux/pkt_sched.h >> > +++ b/include/uapi/linux/pkt_sched.h >> > @@ -964,6 +964,34 @@ struct tc_pie_xstats { >> > __u32 ecn_mark; /* packets marked with ecn*/ >> > }; >> > >> > +/* FQ PIE */ >> > +enum { >> > + TCA_FQ_PIE_UNSPEC, >> > + TCA_FQ_PIE_TARGET, >> > + TCA_FQ_PIE_LIMIT, >> > + TCA_FQ_PIE_TUPDATE, >> > + TCA_FQ_PIE_ALPHA, >> > + TCA_FQ_PIE_BETA, >> > + TCA_FQ_PIE_ECN, >> > + TCA_FQ_PIE_QUANTUM, >> > + TCA_FQ_PIE_BYTEMODE, >> > + TCA_FQ_PIE_FLOWS, >> > + TCA_FQ_PIE_ECN_PROB, >> > + __TCA_FQ_PIE_MAX >> > +}; >> > +#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1) >> > + >> > +struct tc_fq_pie_xstats { >> > + __u32 packets_in; /* total number of packets enqueued */ >> > + __u32 dropped; /* packets dropped due to fq_pie_action */ >> > + __u32 overlimit; /* dropped due to lack of space in queue */ >> > + __u32 ecn_mark; /* packets marked with ecn*/ >> > + __u32 new_flow_count; /* number of time packets >> > + created a 'new flow' */ >> > + __u32 new_flows_len; /* count of flows in new list */ >> > + __u32 old_flows_len; /* count of flows in old list */ >> > +}; >> > + >> > /* CBS */ >> > struct tc_cbs_qopt { >> > __u8 offload; >> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig >> > index 5c02ad97ef23..49f4dd9894a0 100644 >> > --- a/net/sched/Kconfig >> > +++ b/net/sched/Kconfig >> > @@ -358,13 +358,25 @@ config NET_SCH_PIE >> > help >> > Say Y here if you want to use the Proportional Integral controller >> > Enhanced scheduler packet scheduling algorithm. >> > - For more information, please see >> > https://tools.ietf.org/html/rfc8033 >> > + For more information, please see >> > + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 >> > >> > To compile this driver as a module, choose M here: the module >> > will be called sch_pie. >> > >> > If unsure, say N. >> > >> > +config NET_SCH_FQ_PIE >> > + tristate "Flow Queue Proportional Integral controller Enhanced >> > (FQ-PIE) scheduler" >> > + help >> > + Say Y here if you want to use the Flow Queue Proportional Integral >> > controller >> > + Enhanced scheduler packet scheduling algorithm. >> > + >> > + To compile this driver as a module, choose M here: the module >> > + will be called sch_fq_pie. >> > + >> > + If unsure, say N. >> > + >> > config NET_SCH_INGRESS >> > tristate "Ingress/classifier-action Qdisc" >> > depends on NET_CLS_ACT >> > diff --git a/net/sched/Makefile b/net/sched/Makefile >> > index 8a40431d7b5c..fdcd3f7b2fb2 100644 >> > --- a/net/sched/Makefile >> > +++ b/net/sched/Makefile >> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o >> > obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o >> > obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o >> > obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o >> > +obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o >> > obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o >> > obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o >> > obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o >> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c >> > new file mode 100644 >> > index 000000000000..4ccefa0bc7f0 >> > --- /dev/null >> > +++ b/net/sched/sch_fq_pie.c >> > @@ -0,0 +1,485 @@ >> > +/* >> > + * net/sched/sch_fq_pie.c >> > + * >> > + * This program is free software; you can redistribute it and/or >> > + * modify it under the terms of the GNU General Public License >> > + * as published by the Free Software Foundation; either version 2 >> > + * of the License. >> > + * >> > + * This program is distributed in the hope that it will be useful, >> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> > + * GNU General Public License for more details. >> >> Lose the license boilerplate and replace it with a SPDX header line. > > I shall do that. > >> >> > + * Author: Mohit P. Tahiliani <tahili...@nitk.edu.in> >> > + * Author: Sachin D. Patil <sdp.sac...@gmail.com> >> > + * Author: Mohit Bhasi <mohitbhasi1...@gmail.com> >> > + * Author: V Saicharan <vsaicharan1...@gmail.com> >> > + * Author: Leslie Monis <lesliemo...@gmail.com> >> > + * Author: Gautam Ramakrishnan <gautamr...@gmail.com> >> > + * >> > + * References: >> > + * RFC 8033: https://tools.ietf.org/html/rfc8033 >> > + */ >> > + >> > +#include <linux/jhash.h> >> > +#include <linux/vmalloc.h> >> > +#include <net/pie.h> >> > + >> > +struct fq_pie_params { >> > + struct pie_params p_params; >> > + u32 ecn_prob; >> > + u32 flows_cnt; >> > +}; >> > + >> > +struct fq_pie_stats { >> > + u32 packets_in; /* total number of packets enqueued */ >> > + u32 dropped; /* packets dropped due to fq_pie action */ >> > + u32 overlimit; /* dropped due to lack of space in queue */ >> > + u32 ecn_mark; /* packets marked with ECN */ >> > + u32 new_flow_count; /* number of time packets created a new flow >> > */ >> > +}; >> > + >> > +struct fq_pie_flow { >> > + s32 deficit; /* number of credits remaining for the flow >> > */ >> > + u32 backlog; /* size of data in the flow */ >> > + u32 qlen; /* number of packets in the flow */ >> > + struct sk_buff *head; >> > + struct sk_buff *tail; >> > + struct list_head flowchain; >> > + struct pie_vars vars; /* pie vars for the flow */ >> > + struct pie_stats stats; /* pie stats for the flow */ >> > +}; >> > + >> > +struct fq_pie_sched_data { >> > + u32 quantum; /* number of credits in deficit round robin >> > */ >> > + struct fq_pie_flow *flows; >> > + struct Qdisc *sch; >> > + struct fq_pie_params params; >> > + struct fq_pie_stats stats; >> > + struct list_head old_flows; >> > + struct list_head new_flows; >> > + struct timer_list adapt_timer; >> > +}; >> >> The flow and sched_data structs have quite a bit in common with those in >> fq_codel; but the members are in a different order. >> >> > +static void fq_pie_params_init(struct fq_pie_params *params) >> > +{ >> > + pie_params_init(¶ms->p_params); >> > + params->ecn_prob = 10; >> > + params->flows_cnt = 1024; >> > +} >> > + >> > +static inline void flow_queue_add(struct fq_pie_flow *flow, >> > + struct sk_buff *skb) >> > +{ >> > + if (!flow->head) >> > + flow->head = skb; >> > + else >> > + flow->tail->next = skb; >> > + flow->tail = skb; >> > + skb->next = NULL; >> > +} >> > + >> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, >> > + struct sk_buff **to_free) >> > +{ >> > + struct fq_pie_sched_data *q = qdisc_priv(sch); >> > + struct fq_pie_flow *sel_flow; >> > + u32 pkt_len; >> > + u32 idx; >> > + u8 enqueue = false; >> > + >> > + /* Classifies packet into corresponding flow */ >> > + idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt); >> > + sel_flow = &q->flows[idx]; >> >> This is missing the ability to override the classification from tc >> filters. See fq_codel_classify(). > > I wanted to keep it simple initially. Shall add that. > >> >> > + >> > + /* Checks if the qdisc is full */ >> > + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { >> > + q->stats.overlimit++; >> > + sel_flow->stats.overlimit++; >> > + goto out; >> > + } >> >> The memory_limit checks in fq_codel have turned out to be quite useful >> on constrained systems. I'd suggest adding them here as well. > > I shall add that too. > >> >> > + >> > + if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars, >> > + &q->params.p_params)) { >> > + enqueue = true; >> > + } else if (q->params.p_params.ecn && >> > + sel_flow->vars.prob <= >> > + (MAX_PROB / 100) * q->params.ecn_prob && >> > + INET_ECN_set_ce(skb)) { >> > + /* If packet is ecn capable, mark it if drop probability >> > + * is lower than the parameter ecn_prob, else drop it. >> > + */ >> > + q->stats.ecn_mark++; >> > + sel_flow->stats.ecn_mark++; >> > + enqueue = true; >> > + } >> > + if (enqueue) { >> > + pkt_len = qdisc_pkt_len(skb); >> > + q->stats.packets_in++; >> > + sch->qstats.backlog += pkt_len; >> > + sch->q.qlen++; >> > + flow_queue_add(sel_flow, skb); >> > + if (list_empty(&sel_flow->flowchain)) { >> > + list_add_tail(&sel_flow->flowchain, &q->new_flows); >> > + q->stats.new_flow_count++; >> > + sel_flow->deficit = q->quantum; >> > + sel_flow->stats.dropped = 0; >> > + sel_flow->qlen = 0; >> > + sel_flow->backlog = 0; >> > + } >> > + sel_flow->qlen++; >> > + sel_flow->stats.packets_in++; >> > + sel_flow->backlog += pkt_len; >> > + return NET_XMIT_SUCCESS; >> > + } >> > +out: >> > + q->stats.dropped++; >> > + sel_flow->stats.dropped++; >> > + return qdisc_drop(skb, sch, to_free); >> >> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here. >> > > Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN? Yeah, I think that would be more appropriate here, as that would directly throttle sockets on the same host. >> > +} >> > + >> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = { >> > + [TCA_FQ_PIE_TARGET] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_BETA] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_ECN] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32}, >> > + [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32} >> > +}; >> > + >> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow) >> > +{ >> > + struct sk_buff *skb = flow->head; >> > + >> > + flow->head = skb->next; >> > + skb->next = NULL; >> > + return skb; >> > +} >> > + >> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) >> > +{ >> > + struct fq_pie_sched_data *q = qdisc_priv(sch); >> > + struct sk_buff *skb = NULL; >> > + struct fq_pie_flow *flow; >> > + struct list_head *head; >> > + u32 pkt_len; >> > + >> > +begin: >> > + head = &q->new_flows; >> > + if (list_empty(head)) { >> > + head = &q->old_flows; >> > + if (list_empty(head)) >> > + return NULL; >> > + } >> > + >> > + flow = list_first_entry(head, struct fq_pie_flow, flowchain); >> > + /* Flow has exhausted all its credits */ >> > + if (flow->deficit <= 0) { >> > + flow->deficit += q->quantum; >> > + list_move_tail(&flow->flowchain, &q->old_flows); >> > + goto begin; >> > + } >> > + >> > + if (flow->head) { >> > + skb = dequeue_head(flow); >> > + pkt_len = qdisc_pkt_len(skb); >> > + sch->qstats.backlog -= pkt_len; >> > + sch->q.qlen--; >> > + qdisc_bstats_update(sch, skb); >> > + } >> >> If you factor this out into a dequeue_func(), this dequeue() function is >> very close to identical to the one in fq_codel(). > > Do you suggest I put the if(flow->head) block in a different function? Yeah, that was what I meant. However, without doing the full refactoring it's only borderline useful, so feel free to just keep it the way it is... -Toke