[dpdk-dev] [PATCH v13 05/11] eal: add monitor wakeup function

Burakov, Anatoly anatoly.burakov at intel.com
Tue Jan 12 17:25:22 CET 2021


On 12-Jan-21 4:18 PM, Burakov, Anatoly wrote:
> On 12-Jan-21 4:02 PM, Ananyev, Konstantin wrote:
>>
>>>
>>> Now that we have everything in a C file, we can store the information
>>> about our sleep, and have a native mechanism to wake up the sleeping
>>> core. This mechanism would however only wake up a core that's sleeping
>>> while monitoring - waking up from `rte_power_pause` won't work.
>>>
>>> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
>>> ---
>>>
>>> Notes:
>>>      v13:
>>>      - Add comments around wakeup code to explain what it does
>>>      - Add lcore_id parameter checking to prevent buffer overrun
>>>
>>>   .../arm/include/rte_power_intrinsics.h        |  9 ++
>>>   .../include/generic/rte_power_intrinsics.h    | 16 ++++
>>>   .../ppc/include/rte_power_intrinsics.h        |  9 ++
>>>   lib/librte_eal/version.map                    |  1 +
>>>   lib/librte_eal/x86/rte_power_intrinsics.c     | 85 +++++++++++++++++++
>>>   5 files changed, 120 insertions(+)
>>>
>>> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h 
>>> b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> index 27869251a8..39e49cc45b 100644
>>> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>>>   RTE_SET_USED(tsc_timestamp);
>>>   }
>>>
>>> +/**
>>> + * This function is not supported on ARM.
>>> + */
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +RTE_SET_USED(lcore_id);
>>> +}
>>> +
>>>   #ifdef __cplusplus
>>>   }
>>>   #endif
>>> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h 
>>> b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> index a6f1955996..e311d6f8ea 100644
>>> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> @@ -57,6 +57,22 @@ __rte_experimental
>>>   void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>>>   const uint64_t tsc_timestamp);
>>>
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>> + *
>>> + * Wake up a specific lcore that is in a power optimized state and 
>>> is monitoring
>>> + * an address.
>>> + *
>>> + * @note This function will *not* wake up a core that is in a power 
>>> optimized
>>> + *   state due to calling `rte_power_pause`.
>>> + *
>>> + * @param lcore_id
>>> + *   Lcore ID of a sleeping thread.
>>> + */
>>> +__rte_experimental
>>> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
>>> +
>>>   /**
>>>    * @warning
>>>    * @b EXPERIMENTAL: this API may change without prior notice
>>> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h 
>>> b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> index 248d1f4a23..2e7db0e7eb 100644
>>> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>>>   RTE_SET_USED(tsc_timestamp);
>>>   }
>>>
>>> +/**
>>> + * This function is not supported on PPC64.
>>> + */
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +RTE_SET_USED(lcore_id);
>>> +}
>>> +
>>>   #ifdef __cplusplus
>>>   }
>>>   #endif
>>> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
>>> index 20945b1efa..ac026e289d 100644
>>> --- a/lib/librte_eal/version.map
>>> +++ b/lib/librte_eal/version.map
>>> @@ -406,6 +406,7 @@ EXPERIMENTAL {
>>>
>>>   # added in 21.02
>>>   rte_power_monitor;
>>> +rte_power_monitor_wakeup;
>>>   rte_power_pause;
>>>   };
>>>
>>> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c 
>>> b/lib/librte_eal/x86/rte_power_intrinsics.c
>>> index a9cd1afe9d..46a4fb6cd5 100644
>>> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
>>> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
>>> @@ -2,8 +2,31 @@
>>>    * Copyright(c) 2020 Intel Corporation
>>>    */
>>>
>>> +#include <rte_common.h>
>>> +#include <rte_lcore.h>
>>> +#include <rte_spinlock.h>
>>> +
>>>   #include "rte_power_intrinsics.h"
>>>
>>> +/*
>>> + * Per-lcore structure holding current status of C0.2 sleeps.
>>> + */
>>> +static struct power_wait_status {
>>> +rte_spinlock_t lock;
>>> +volatile void *monitor_addr; /**< NULL if not currently sleeping */
>>> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
>>> +
>>> +static inline void
>>> +__umwait_wakeup(volatile void *addr)
>>> +{
>>> +uint64_t val;
>>> +
>>> +/* trigger a write but don't change the value */
>>> +val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
>>> +__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
>>> +__ATOMIC_RELAXED, __ATOMIC_RELAXED);
>>> +}
>>> +
>>>   static uint8_t wait_supported;
>>>
>>>   static inline uint64_t
>>> @@ -36,6 +59,12 @@ rte_power_monitor(const struct 
>>> rte_power_monitor_cond *pmc,
>>>   {
>>>   const uint32_t tsc_l = (uint32_t)tsc_timestamp;
>>>   const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
>>> +const unsigned int lcore_id = rte_lcore_id();
>>> +struct power_wait_status *s;
>>> +
>>> +/* prevent non-EAL thread from using this API */
>>> +if (lcore_id >= RTE_MAX_LCORE)
>>> +return;
>>>
>>>   /* prevent user from running this instruction if it's not supported */
>>>   if (!wait_supported)
>>> @@ -60,11 +89,24 @@ rte_power_monitor(const struct 
>>> rte_power_monitor_cond *pmc,
>>>   if (masked == pmc->val)
>>>   return;
>>>   }
>>> +
>>> +s = &wait_status[lcore_id];
>>> +
>>> +/* update sleep address */
>>> +rte_spinlock_lock(&s->lock);
>>> +s->monitor_addr = pmc->addr;
>>> +rte_spinlock_unlock(&s->lock);
>>
>> It has been a while since I last looked at this,
>> but shouldn't we grab the lock before monitor()?
>> I.E:
>> lock();
>> monitor();
>> addr=...;
>> unlock();
>> umwait();
>>
> 
> I don't believe so.
> 
> The idea here is to only store the address when we are looking to sleep, 
> and avoid the locks entirely if we already know we aren't going to 
> sleep. I mean, technically we could lock unconditionally, then unlock 
> when we're done, but there's very little practical difference between 
> the two because the moment we are interested in (sleep) happens the same 
> way whether we lock before or after monitor().

On second thought, putting the lock before monitor() and unlocking 
afterwards allows us to ask for a wakeup earlier, without necessarily 
waiting for a sleep. So I think I'll take your suggestion on board anyway, 
thanks!

> 
>>> +
>>>   /* execute UMWAIT */
>>>   asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
>>>   : /* ignore rflags */
>>>   : "D"(0), /* enter C0.2 */
>>>     "a"(tsc_l), "d"(tsc_h));
>>> +
>>> +/* erase sleep address */
>>> +rte_spinlock_lock(&s->lock);
>>> +s->monitor_addr = NULL;
>>> +rte_spinlock_unlock(&s->lock);
>>>   }
>>>
>>>   /**
>>> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
>>>   if (i.power_monitor && i.power_pause)
>>>   wait_supported = 1;
>>>   }
>>> +
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +struct power_wait_status *s;
>>> +
>>> +/* prevent buffer overrun */
>>> +if (lcore_id >= RTE_MAX_LCORE)
>>> +return;
>>> +
>>> +/* prevent user from running this instruction if it's not supported */
>>> +if (!wait_supported)
>>> +return;
>>> +
>>> +s = &wait_status[lcore_id];
>>> +
>>> +/*
>>> + * There is a race condition between sleep, wakeup and locking, but we
>>> + * don't need to handle it.
>>> + *
>>> + * Possible situations:
>>> + *
>>> + * 1. T1 locks, sets address, unlocks
>>> + * 2. T2 locks, triggers wakeup, unlocks
>>> + * 3. T1 sleeps
>>> + *
>>> + * In this case, because T1 has already set the address for monitoring,
>>> + * we will wake up immediately even if T2 triggers wakeup before T1
>>> + * goes to sleep.
>>> + *
>>> + * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
>>> + * 2. T2 locks, triggers wakeup, and unlocks
>>> + * 3. T1 locks, erases address, and unlocks
>>> + *
>>> + * In this case, since we've already woken up, the "wakeup" was
>>> + * unneeded, and since T1 is still waiting on T2 releasing the lock, 
>>> the
>>> + * wakeup address is still valid so it's perfectly safe to write it.
>>> + */
>>> +rte_spinlock_lock(&s->lock);
>>> +if (s->monitor_addr != NULL)
>>> +__umwait_wakeup(s->monitor_addr);
>>> +rte_spinlock_unlock(&s->lock);
>>> +}
>>> -- 
>>> 2.25.1
> 
> 


-- 
Thanks,
Anatoly


More information about the dev mailing list