[RESEND PATCH v9 4/5] app/testpmd: report lcore usage
David Marchand
david.marchand at redhat.com
Thu Feb 9 09:43:24 CET 2023
On Wed, Feb 8, 2023 at 9:49 AM Robin Jarry <rjarry at redhat.com> wrote:
>
> The --record-core-cycles option already accounts for busy cycles. One
> turn of packet_fwd_t is considered "busy" if there was at least one
> received or transmitted packet.
>
> Rename core_cycles to busy_cycles in struct fwd_stream to make it more
> explicit. Add total_cycles to struct fwd_lcore. Add cycles accounting in
> noisy_vnf where it was missing.
>
> When --record-core-cycles is specified, register a callback with
> rte_lcore_register_usage_cb() and update total_cycles every turn of
> lcore loop based on a starting tsc value.
>
> In the callback, resolve the proper struct fwd_lcore based on lcore_id
> and return the lcore total_cycles and the sum of busy_cycles of all its
> fwd_streams.
>
> This makes the cycles counters available in rte_lcore_dump() and the
> lcore telemetry API:
>
> testpmd> dump_lcores
> lcore 3, socket 0, role RTE, cpuset 3
> lcore 4, socket 0, role RTE, cpuset 4, busy cycles 1228584096/9239923140
> lcore 5, socket 0, role RTE, cpuset 5, busy cycles 1255661768/9218141538
>
> --> /eal/lcore/info,4
> {
> "/eal/lcore/info": {
> "lcore_id": 4,
> "socket": 0,
> "role": "RTE",
> "cpuset": [
> 4
> ],
> "busy_cycles": 10623340318,
> "total_cycles": 55331167354
> }
> }
>
> Signed-off-by: Robin Jarry <rjarry at redhat.com>
> Acked-by: Morten Brørup <mb at smartsharesystems.com>
> Acked-by: Konstantin Ananyev <konstantin.v.ananyev at yandex.ru>
> Reviewed-by: Kevin Laatz <kevin.laatz at intel.com>
> ---
>
> Notes:
> v8 -> v9: Fixed accounting of total cycles
>
> app/test-pmd/noisy_vnf.c | 8 +++++++-
> app/test-pmd/testpmd.c | 42 ++++++++++++++++++++++++++++++++++++----
> app/test-pmd/testpmd.h | 25 +++++++++++++++---------
> 3 files changed, 61 insertions(+), 14 deletions(-)
>
> diff --git a/app/test-pmd/noisy_vnf.c b/app/test-pmd/noisy_vnf.c
> index c65ec6f06a5c..ce5a3e5e6987 100644
> --- a/app/test-pmd/noisy_vnf.c
> +++ b/app/test-pmd/noisy_vnf.c
> @@ -144,6 +144,7 @@ pkt_burst_noisy_vnf(struct fwd_stream *fs)
> struct noisy_config *ncf = noisy_cfg[fs->rx_port];
> struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> struct rte_mbuf *tmp_pkts[MAX_PKT_BURST];
> + uint64_t start_tsc = 0;
> uint16_t nb_deqd = 0;
> uint16_t nb_rx = 0;
> uint16_t nb_tx = 0;
> @@ -153,6 +154,8 @@ pkt_burst_noisy_vnf(struct fwd_stream *fs)
> bool needs_flush = false;
> uint64_t now;
>
> + get_start_cycles(&start_tsc);
> +
> nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
> pkts_burst, nb_pkt_per_burst);
> inc_rx_burst_stats(fs, nb_rx);
> @@ -169,7 +172,7 @@ pkt_burst_noisy_vnf(struct fwd_stream *fs)
> inc_tx_burst_stats(fs, nb_tx);
> fs->tx_packets += nb_tx;
> fs->fwd_dropped += drop_pkts(pkts_burst, nb_rx, nb_tx);
> - return;
> + goto end;
> }
>
> fifo_free = rte_ring_free_count(ncf->f);
> @@ -219,6 +222,9 @@ pkt_burst_noisy_vnf(struct fwd_stream *fs)
> fs->fwd_dropped += drop_pkts(tmp_pkts, nb_deqd, sent);
> ncf->prev_time = rte_get_timer_cycles();
> }
> +end:
> + if (nb_rx > 0 || nb_tx > 0)
> + get_end_cycles(fs, start_tsc);
> }
>
> #define NOISY_STRSIZE 256
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index e366f81a0f46..eeb96aefa80b 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -2053,7 +2053,7 @@ fwd_stats_display(void)
> fs->rx_bad_outer_ip_csum;
>
> if (record_core_cycles)
> - fwd_cycles += fs->core_cycles;
> + fwd_cycles += fs->busy_cycles;
> }
> for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
> pt_id = fwd_ports_ids[i];
> @@ -2145,7 +2145,7 @@ fwd_stats_display(void)
> else
> total_pkts = total_recv;
>
> - printf("\n CPU cycles/packet=%.2F (total cycles="
> + printf("\n CPU cycles/packet=%.2F (busy cycles="
> "%"PRIu64" / total %s packets=%"PRIu64") at %"PRIu64
> " MHz Clock\n",
> (double) fwd_cycles / total_pkts,
> @@ -2184,8 +2184,10 @@ fwd_stats_reset(void)
>
> memset(&fs->rx_burst_stats, 0, sizeof(fs->rx_burst_stats));
> memset(&fs->tx_burst_stats, 0, sizeof(fs->tx_burst_stats));
> - fs->core_cycles = 0;
> + fs->busy_cycles = 0;
> }
> + for (i = 0; i < cur_fwd_config.nb_fwd_lcores; i++)
> + fwd_lcores[i]->total_cycles = 0;
This instrumentation accuracy may not be that important in testpmd
(because testpmd is just a test/validation tool).
However, resetting total_cycles sets a bad example for people
who may look at this code.
It does not comply with the EAL API:
the associated lcores may still be running at the moment a user resets
the fwd stats.
> }
>
> static void
> @@ -2248,6 +2250,7 @@ static void
> run_pkt_fwd_on_lcore(struct fwd_lcore *fc, packet_fwd_t pkt_fwd)
> {
> struct fwd_stream **fsm;
> + uint64_t start_tsc;
> streamid_t nb_fs;
> streamid_t sm_id;
> #ifdef RTE_LIB_BITRATESTATS
> @@ -2262,6 +2265,7 @@ run_pkt_fwd_on_lcore(struct fwd_lcore *fc, packet_fwd_t pkt_fwd)
> #endif
> fsm = &fwd_streams[fc->stream_idx];
> nb_fs = fc->stream_nb;
> + start_tsc = rte_rdtsc();
> do {
> for (sm_id = 0; sm_id < nb_fs; sm_id++)
> if (!fsm[sm_id]->disabled)
> @@ -2284,10 +2288,36 @@ run_pkt_fwd_on_lcore(struct fwd_lcore *fc, packet_fwd_t pkt_fwd)
> latencystats_lcore_id == rte_lcore_id())
> rte_latencystats_update();
> #endif
> -
> + if (record_core_cycles)
> + fc->total_cycles = rte_rdtsc() - start_tsc;
By using a single tsc reference taken at the start of this function,
total_cycles will be reset every time forwarding is stopped /
restarted.
A more accurate way to account for the cycles consumed by this lcore
would be to increment total_cycles by the elapsed delta on each loop
iteration.
Like:
@@ -2248,6 +2248,7 @@ static void
run_pkt_fwd_on_lcore(struct fwd_lcore *fc, packet_fwd_t pkt_fwd)
{
struct fwd_stream **fsm;
+ uint64_t prev_tsc;
streamid_t nb_fs;
streamid_t sm_id;
#ifdef RTE_LIB_BITRATESTATS
@@ -2262,6 +2263,7 @@ run_pkt_fwd_on_lcore(struct fwd_lcore *fc,
packet_fwd_t pkt_fwd)
#endif
fsm = &fwd_streams[fc->stream_idx];
nb_fs = fc->stream_nb;
+ prev_tsc = rte_rdtsc();
do {
for (sm_id = 0; sm_id < nb_fs; sm_id++)
if (!fsm[sm_id]->disabled)
@@ -2285,9 +2287,42 @@ run_pkt_fwd_on_lcore(struct fwd_lcore *fc,
packet_fwd_t pkt_fwd)
rte_latencystats_update();
#endif
+ if (record_core_cycles) {
+ uint64_t current_tsc = rte_rdtsc();
+
+ fc->total_cycles += current_tsc - prev_tsc;
+ prev_tsc = current_tsc;
+ }
} while (! fc->stopped);
}
I also have one question about those updates.
I wonder if we are missing some __atomic_store/load pairs (probably
more of an issue on non-x86 arches), since updating and reading those
cycle counters happen on different threads.
This issue predates your patch (it already applied to the previous
fs->core_cycles accesses).
I am not asking for a fix right away; this last point can wait post-rc1.
--
David Marchand
More information about the dev
mailing list