[dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code

Dumitrescu, Cristian cristian.dumitrescu at intel.com
Tue Jan 2 17:13:34 CET 2018


Hi Alan,

NAK for now.

There is a good reason for truncating the WRR cost to 8-bit value, which is keeping the size of the rte_sched_pipe structure to single cache line (64 bytes). This is done for performance reasons at the expense of some accuracy loss for the scheduling of the 4x queues per traffic class.

Is there a way to make the improvement while working with 8-bit WRR cost values in the pipe structure?

> -----Original Message-----
> From: alangordondewar at gmail.com [mailto:alangordondewar at gmail.com]
> Sent: Thursday, November 30, 2017 9:05 AM
> To: Dumitrescu, Cristian <cristian.dumitrescu at intel.com>
> Cc: dev at dpdk.org; Alan Dewar <alan.dewar at att.com>
> Subject: [PATCH v2] sched: fix overflow errors in WRR weighting code
> 
> From: Alan Dewar <alan.dewar at att.com>
> 
> Revised patch - this version fixes an issue when a small wrr_cost is
> shifted so far right that its value becomes zero.
> 
> The WRR code calculates the lowest common denominator between the
> four
> WRR weights as a uint32_t value and divides the LCD by each of the WRR
> weights and casts the results as a uint8_t.  This casting can cause
> the ratios of the computed wrr costs to be wrong.  For example with
> WRR weights of 3, 5, 7 and 11, the LCD is computed to be
> 1155.  The WRR costs get computed as:

Picking prime numbers for the weights is generally a bad idea. If you would pick e.g. 4, 6, 8 and 12 rather than 3, 5, 7 and 11 you would avoid any issues due to the 8-bit truncation.

> 
>   1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105.
> 
> When the value 385 is cast into an uint8_t it ends up as 129.
> Rather than casting straight into a uint8_t, this patch shifts the
> computed WRR costs right so that the largest value is only eight bits
> wide.
> 
> In grinder_schedule, the packet length is multiplied by the WRR cost
> and added to the grinder's wrr_tokens value.  The grinder's wrr_tokens
> field is a uint16_t, so the combination of a packet length of 1500
> bytes and a wrr cost of 44 will overflow this field on the first packet.
> 
> This patch increases the width of the grinder's wrr_tokens and
> wrr_mask fields from uint16_t to uint32_t.
> 

Increasing the size of the grinder fields is OK, but I am not sure whether it is really helpful, as the values saved in the pipe structure are 8-bit.


> In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens
> array are copied to the appropriate pipe's wrr_tokens array.  However
> the pipe's wrr_tokens array is only a uint8_t array, so unused tokens
> were quite frequently lost, which upset the balance of traffic across
> the four WRR queues.
> 
> This patch increases the width of the pipe's wrr_tokens array from
> a uint8_t to uint32_t.

This is not allowed for performance reasons, as having 16-bit or 32-bit WRR cost values in pipe structure would increase the size of the pipe structure from one cache line to two cache lines.

> 
> Signed-off-by: Alan Dewar <alan.dewar at att.com>
> ---
> v2 - fixed bug in the wrr_cost calculation code that could result
> in a zero wrr_cost
> 
>  lib/librte_sched/rte_sched.c        | 59 +++++++++++++++++++++++++++++---
> -----
>  lib/librte_sched/rte_sched_common.h | 15 ++++++++++
>  2 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> index 7252f85..324743d 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -130,7 +130,7 @@ struct rte_sched_pipe {
>  	uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
> 
>  	/* Weighted Round Robin (WRR) */
> -	uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> 
>  	/* TC oversubscription */
>  	uint32_t tc_ov_credits;
> @@ -205,8 +205,8 @@ struct rte_sched_grinder {
>  	struct rte_mbuf *pkt;
> 
>  	/* WRR */
> -	uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> -	uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  	uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  };
> 
> @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms,
> uint32_t rate)
>  	return time;
>  }
> 
> +static uint32_t rte_sched_reduce_to_byte(uint32_t value)
> +{
> +	uint32_t shift = 0;
> +
> +	while (value & 0xFFFFFF00) {
> +		value >>= 1;
> +		shift++;
> +	}
> +	return shift;
> +}
> +
>  static void
>  rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port,
> struct rte_sched_port_params *params)
>  {
> @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			uint32_t
> wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  			uint32_t lcd, lcd1, lcd2;
>  			uint32_t qindex;
> +			uint32_t low_pos;
> +			uint32_t shift;
> 
>  			qindex = j *
> RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS;
> 
> @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
>  			lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
>  			lcd = rte_get_lcd(lcd1, lcd2);
> +			low_pos = rte_min_pos_4_u32(wrr_cost);
> 
>  			wrr_cost[0] = lcd / wrr_cost[0];
>  			wrr_cost[1] = lcd / wrr_cost[1];
>  			wrr_cost[2] = lcd / wrr_cost[2];
>  			wrr_cost[3] = lcd / wrr_cost[3];
> 
> +			shift =
> rte_sched_reduce_to_byte(wrr_cost[low_pos]);
> +			wrr_cost[0] >>= shift;
> +			wrr_cost[1] >>= shift;
> +			wrr_cost[2] >>= shift;
> +			wrr_cost[3] >>= shift;
> +
> +			if (wrr_cost[0] == 0)
> +				wrr_cost[0]++;
> +			if (wrr_cost[1] == 0)
> +				wrr_cost[1]++;
> +			if (wrr_cost[2] == 0)
> +				wrr_cost[2]++;
> +			if (wrr_cost[3] == 0)
> +				wrr_cost[3]++;
> +
>  			dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0];
>  			dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1];
>  			dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2];
> @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port,
> uint32_t pos)
> 
>  	qindex = tc_index * 4;
> 
> -	grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) <<
> RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1])
> << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2])
> << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3])
> << RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] <<
> +		RTE_SCHED_WRR_SHIFT;
> 
> -	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
> +	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF;
> 
>  	grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex];
>  	grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1];
> @@ -1981,14 +2014,14 @@ static inline void
>  grinder_wrr(struct rte_sched_port *port, uint32_t pos)
>  {
>  	struct rte_sched_grinder *grinder = port->grinder + pos;
> -	uint16_t wrr_tokens_min;
> +	uint32_t wrr_tokens_min;
> 
>  	grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
>  	grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
>  	grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
>  	grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
> 
> -	grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
> +	grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens);
>  	wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
> 
>  	grinder->wrr_tokens[0] -= wrr_tokens_min;
> diff --git a/lib/librte_sched/rte_sched_common.h
> b/lib/librte_sched/rte_sched_common.h
> index aed144b..a3c6bc2 100644
> --- a/lib/librte_sched/rte_sched_common.h
> +++ b/lib/librte_sched/rte_sched_common.h
> @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x)
>  	return pos0;
>  }
> 
> +static inline uint32_t
> +rte_min_pos_4_u32(uint32_t *x)
> +{
> +	uint32_t pos0 = 0;
> +	uint32_t pos1 = 2;
> +
> +	if (x[1] <= x[0])
> +		pos0 = 1;
> +	if (x[3] <= x[2])
> +		pos1 = 3;
> +	if (x[pos1] <= x[pos0])
> +		pos0 = pos1;
> +
> +	return pos0;
> +}
>  #endif
> 
>  /*
> --
> 2.1.4

Regards,
Cristian


More information about the dev mailing list