[dpdk-dev] [PATCH v8 1/7] power_intrinsics: use callbacks for comparison

McDaniel, Timothy timothy.mcdaniel at intel.com
Thu Jul 8 18:56:58 CEST 2021



> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov at intel.com>
> Sent: Thursday, July 8, 2021 9:14 AM
> To: dev at dpdk.org; McDaniel, Timothy <timothy.mcdaniel at intel.com>; Xing,
> Beilei <beilei.xing at intel.com>; Wu, Jingjing <jingjing.wu at intel.com>; Yang,
> Qiming <qiming.yang at intel.com>; Zhang, Qi Z <qi.z.zhang at intel.com>; Wang,
> Haiyue <haiyue.wang at intel.com>; Matan Azrad <matan at nvidia.com>; Shahaf
> Shuler <shahafs at nvidia.com>; Viacheslav Ovsiienko <viacheslavo at nvidia.com>;
> Richardson, Bruce <bruce.richardson at intel.com>; Ananyev, Konstantin
> <konstantin.ananyev at intel.com>
> Cc: Loftus, Ciara <ciara.loftus at intel.com>; Hunt, David <david.hunt at intel.com>
> Subject: [PATCH v8 1/7] power_intrinsics: use callbacks for comparison
> 
> Previously, the semantics of power monitor were such that we were
> checking current value against the expected value, and if they matched,
> then the sleep was aborted. This is somewhat inflexible, because it only
> allowed us to check for a specific value in a specific way.
> 
> This commit replaces the comparison with a user callback mechanism, so
> that any PMD (or other code) using `rte_power_monitor()` can define
> their own comparison semantics and decision making on how to detect the
> need to abort the entering of power optimized state.
> 
> Existing implementations are adjusted to follow the new semantics.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
> ---
> 
> Notes:
>     v4:
>     - Return error if callback is set to NULL
>     - Replace raw number with a macro in monitor condition opaque data
> 
>     v2:
>     - Use callback mechanism for more flexibility
>     - Address feedback from Konstantin
> 
>  doc/guides/rel_notes/release_21_08.rst        |  2 ++
>  drivers/event/dlb2/dlb2.c                     | 17 ++++++++--
>  drivers/net/i40e/i40e_rxtx.c                  | 20 +++++++----
>  drivers/net/iavf/iavf_rxtx.c                  | 20 +++++++----
>  drivers/net/ice/ice_rxtx.c                    | 20 +++++++----
>  drivers/net/ixgbe/ixgbe_rxtx.c                | 20 +++++++----
>  drivers/net/mlx5/mlx5_rx.c                    | 17 ++++++++--
>  .../include/generic/rte_power_intrinsics.h    | 33 +++++++++++++++----
>  lib/eal/x86/rte_power_intrinsics.c            | 17 +++++-----
>  9 files changed, 122 insertions(+), 44 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/release_21_08.rst
> b/doc/guides/rel_notes/release_21_08.rst
> index c92e016783..65910de348 100644
> --- a/doc/guides/rel_notes/release_21_08.rst
> +++ b/doc/guides/rel_notes/release_21_08.rst
> @@ -135,6 +135,8 @@ API Changes
>  * eal: ``rte_strscpy`` sets ``rte_errno`` to ``E2BIG`` in case of string
>    truncation.
> 
> +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
> +
> 
>  ABI Changes
>  -----------
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index eca183753f..252bbd8d5e 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port,
> int num)
>  	}
>  }
> 
> +#define CLB_MASK_IDX 0
> +#define CLB_VAL_IDX 1
> +static int
> +dlb2_monitor_callback(const uint64_t val,
> +		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> +	/* abort if the value matches */
> +	return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 :
> 0;
> +}
> +
>  static inline int
>  dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
>  		  struct dlb2_eventdev_port *ev_port,
> @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
>  			expected_value = 0;
> 
>  		pmc.addr = monitor_addr;
> -		pmc.val = expected_value;
> -		pmc.mask = qe_mask.raw_qe[1];
> +		/* store expected value and comparison mask in opaque data */
> +		pmc.opaque[CLB_VAL_IDX] = expected_value;
> +		pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
> +		/* set up callback */
> +		pmc.fn = dlb2_monitor_callback;
>  		pmc.size = sizeof(uint64_t);
> 
>  		rte_power_monitor(&pmc, timeout + start_ticks);
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index 8d65f287f4..65f325ede1 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -81,6 +81,18 @@
>  #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
>  		(PKT_TX_OFFLOAD_MASK ^
> I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
> 
> +static int
> +i40e_monitor_callback(const uint64_t value,
> +		const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +	const uint64_t m = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> +	/*
> +	 * we expect the DD bit to be set to 1 if this descriptor was already
> +	 * written to.
> +	 */
> +	return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
> @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>  	/* watch for changes in status bit */
>  	pmc->addr = &rxdp->wb.qword1.status_error_len;
> 
> -	/*
> -	 * we expect the DD bit to be set to 1 if this descriptor was already
> -	 * written to.
> -	 */
> -	pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
> -	pmc->mask = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> +	/* comparison callback */
> +	pmc->fn = i40e_monitor_callback;
> 
>  	/* registers are 64-bit */
>  	pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
> index f817fbc49b..d61b32fcee 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -57,6 +57,18 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type)
>  				rxdid_map[flex_type] :
> IAVF_RXDID_COMMS_OVS_1;
>  }
> 
> +static int
> +iavf_monitor_callback(const uint64_t value,
> +		const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +	const uint64_t m = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> +	/*
> +	 * we expect the DD bit to be set to 1 if this descriptor was already
> +	 * written to.
> +	 */
> +	return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  iavf_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
>  {
> @@ -69,12 +81,8 @@ iavf_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>  	/* watch for changes in status bit */
>  	pmc->addr = &rxdp->wb.qword1.status_error_len;
> 
> -	/*
> -	 * we expect the DD bit to be set to 1 if this descriptor was already
> -	 * written to.
> -	 */
> -	pmc->val = rte_cpu_to_le_64(1 << IAVF_RX_DESC_STATUS_DD_SHIFT);
> -	pmc->mask = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> +	/* comparison callback */
> +	pmc->fn = iavf_monitor_callback;
> 
>  	/* registers are 64-bit */
>  	pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
> index 3f6e735984..5d7ab4f047 100644
> --- a/drivers/net/ice/ice_rxtx.c
> +++ b/drivers/net/ice/ice_rxtx.c
> @@ -27,6 +27,18 @@ uint64_t
> rte_net_ice_dynflag_proto_xtr_ipv6_flow_mask;
>  uint64_t rte_net_ice_dynflag_proto_xtr_tcp_mask;
>  uint64_t rte_net_ice_dynflag_proto_xtr_ip_offset_mask;
> 
> +static int
> +ice_monitor_callback(const uint64_t value,
> +		const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +	const uint64_t m = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> +	/*
> +	 * we expect the DD bit to be set to 1 if this descriptor was already
> +	 * written to.
> +	 */
> +	return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  ice_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
>  {
> @@ -39,12 +51,8 @@ ice_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>  	/* watch for changes in status bit */
>  	pmc->addr = &rxdp->wb.status_error0;
> 
> -	/*
> -	 * we expect the DD bit to be set to 1 if this descriptor was already
> -	 * written to.
> -	 */
> -	pmc->val = rte_cpu_to_le_16(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S);
> -	pmc->mask = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> +	/* comparison callback */
> +	pmc->fn = ice_monitor_callback;
> 
>  	/* register is 16-bit */
>  	pmc->size = sizeof(uint16_t);
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index d69f36e977..c814a28cb4 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -1369,6 +1369,18 @@ const uint32_t
>  		RTE_PTYPE_INNER_L3_IPV4_EXT |
> RTE_PTYPE_INNER_L4_UDP,
>  };
> 
> +static int
> +ixgbe_monitor_callback(const uint64_t value,
> +		const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +	const uint64_t m = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> +	/*
> +	 * we expect the DD bit to be set to 1 if this descriptor was already
> +	 * written to.
> +	 */
> +	return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  ixgbe_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
> @@ -1381,12 +1393,8 @@ ixgbe_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>  	/* watch for changes in status bit */
>  	pmc->addr = &rxdp->wb.upper.status_error;
> 
> -	/*
> -	 * we expect the DD bit to be set to 1 if this descriptor was already
> -	 * written to.
> -	 */
> -	pmc->val = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> -	pmc->mask = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> +	/* comparison callback */
> +	pmc->fn = ixgbe_monitor_callback;
> 
>  	/* the registers are 32-bit */
>  	pmc->size = sizeof(uint32_t);
> diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
> index 777a1d6e45..17370b77dc 100644
> --- a/drivers/net/mlx5/mlx5_rx.c
> +++ b/drivers/net/mlx5/mlx5_rx.c
> @@ -269,6 +269,18 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev,
> uint16_t rx_queue_id)
>  	return rx_queue_count(rxq);
>  }
> 
> +#define CLB_VAL_IDX 0
> +#define CLB_MSK_IDX 1
> +static int
> +mlx_monitor_callback(const uint64_t value,
> +		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> +	const uint64_t m = opaque[CLB_MSK_IDX];
> +	const uint64_t v = opaque[CLB_VAL_IDX];
> +
> +	return (value & m) == v ? -1 : 0;
> +}
> +
>  int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
>  	struct mlx5_rxq_data *rxq = rx_queue;
> @@ -282,8 +294,9 @@ int mlx5_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>  		return -rte_errno;
>  	}
>  	pmc->addr = &cqe->op_own;
> -	pmc->val =  !!idx;
> -	pmc->mask = MLX5_CQE_OWNER_MASK;
> +	pmc->opaque[CLB_VAL_IDX] = !!idx;
> +	pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
> +	pmc->fn = mlx_monitor_callback;
>  	pmc->size = sizeof(uint8_t);
>  	return 0;
>  }
> diff --git a/lib/eal/include/generic/rte_power_intrinsics.h
> b/lib/eal/include/generic/rte_power_intrinsics.h
> index dddca3d41c..c9aa52a86d 100644
> --- a/lib/eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/eal/include/generic/rte_power_intrinsics.h
> @@ -18,19 +18,38 @@
>   * which are architecture-dependent.
>   */
> 
> +/** Size of the opaque data in monitor condition */
> +#define RTE_POWER_MONITOR_OPAQUE_SZ 4
> +
> +/**
> + * Callback definition for monitoring conditions. Callbacks with this signature
> + * will be used by `rte_power_monitor()` to check if the entering of power
> + * optimized state should be aborted.
> + *
> + * @param val
> + *   The value read from memory.
> + * @param opaque
> + *   Callback-specific data.
> + *
> + * @return
> + *   0 if entering of power optimized state should proceed
> + *   -1 if entering of power optimized state should be aborted
> + */
> +typedef int (*rte_power_monitor_clb_t)(const uint64_t val,
> +		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]);
>  struct rte_power_monitor_cond {
>  	volatile void *addr;  /**< Address to monitor for changes */
> -	uint64_t val;         /**< If the `mask` is non-zero, location pointed
> -	                       *   to by `addr` will be read and compared
> -	                       *   against this value.
> -	                       */
> -	uint64_t mask;   /**< 64-bit mask to extract value read from `addr` */
> -	uint8_t size;    /**< Data size (in bytes) that will be used to compare
> -	                  *   expected value (`val`) with data read from the
> +	uint8_t size;    /**< Data size (in bytes) that will be read from the
>  	                  *   monitored memory location (`addr`). Can be 1, 2,
>  	                  *   4, or 8. Supplying any other value will result in
>  	                  *   an error.
>  	                  */
> +	rte_power_monitor_clb_t fn; /**< Callback to be used to check if
> +	                             *   entering power optimized state should
> +	                             *   be aborted.
> +	                             */
> +	uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ];
> +	/**< Callback-specific data */
>  };
> 
>  /**
> diff --git a/lib/eal/x86/rte_power_intrinsics.c
> b/lib/eal/x86/rte_power_intrinsics.c
> index 39ea9fdecd..66fea28897 100644
> --- a/lib/eal/x86/rte_power_intrinsics.c
> +++ b/lib/eal/x86/rte_power_intrinsics.c
> @@ -76,6 +76,7 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>  	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
>  	const unsigned int lcore_id = rte_lcore_id();
>  	struct power_wait_status *s;
> +	uint64_t cur_value;
> 
>  	/* prevent user from running this instruction if it's not supported */
>  	if (!wait_supported)
> @@ -91,6 +92,9 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>  	if (__check_val_size(pmc->size) < 0)
>  		return -EINVAL;
> 
> +	if (pmc->fn == NULL)
> +		return -EINVAL;
> +
>  	s = &wait_status[lcore_id];
> 
>  	/* update sleep address */
> @@ -110,16 +114,11 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>  	/* now that we've put this address into monitor, we can unlock */
>  	rte_spinlock_unlock(&s->lock);
> 
> -	/* if we have a comparison mask, we might not need to sleep at all */
> -	if (pmc->mask) {
> -		const uint64_t cur_value = __get_umwait_val(
> -				pmc->addr, pmc->size);
> -		const uint64_t masked = cur_value & pmc->mask;
> +	cur_value = __get_umwait_val(pmc->addr, pmc->size);
> 
> -		/* if the masked value is already matching, abort */
> -		if (masked == pmc->val)
> -			goto end;
> -	}
> +	/* check if callback indicates we should abort */
> +	if (pmc->fn(cur_value, pmc->opaque) != 0)
> +		goto end;
> 
>  	/* execute UMWAIT */
>  	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> --
> 2.25.1

DLB changes look good to me

Acked-by: timothy.mcdaniel at intel.com


More information about the dev mailing list