[dpdk-dev] [PATCH v1] test/ring: ring perf test case enhancement
Honnappa Nagarahalli
Honnappa.Nagarahalli at arm.com
Thu Dec 20 22:03:54 CET 2018
>
> +Cc Olivier, maintainer of the ring library.
>
> 20/12/2018 12:33, Gavin Hu:
> > From: Joyce Kong <joyce.kong at arm.com>
> >
> > Run ring perf test on all available cores to really verify MPMC operations.
> > The old way of running on a pair of cores is not enough for MPMC
> > rings. We used this test case for ring optimization and it was really
> > helpful for measuring the ring performance in multi-core environment.
IMO, the last sentence does not convey quantifiable information. I suggest taking that out or replacing it with something that explains how it is useful.
> >
> > Suggested-by: Gavin Hu <gavin.hu at arm.com>
> > Signed-off-by: Joyce Kong <joyce.kong at arm.com>
> > Reviewed-by: Ruifeng Wang <Ruifeng.Wang at arm.com>
> > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli at arm.com>
> > Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar at arm.com>
> > Reviewed-by: Ola Liljedahl <Ola.Liljedahl at arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu at arm.com>
> > ---
> > test/test/test_ring_perf.c | 82
> > ++++++++++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 80 insertions(+), 2 deletions(-)
> >
> > diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
> > index ebb3939..819d119 100644
> > --- a/test/test/test_ring_perf.c
> > +++ b/test/test/test_ring_perf.c
> > @@ -20,12 +20,17 @@
> > * * Empty ring dequeue
> > * * Enqueue/dequeue of bursts in 1 threads
> > * * Enqueue/dequeue of bursts in 2 threads
> > + * * Enqueue/dequeue of bursts in all available threads
> > */
> >
> > #define RING_NAME "RING_PERF"
> > #define RING_SIZE 4096
> > #define MAX_BURST 32
> >
> > +#ifndef ARRAY_SIZE
> > +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> > +#endif
> > +
> > /*
> > * the sizes to enqueue and dequeue in testing
> > * (marked volatile so they won't be seen as compile-time constants)
> > @@ -248,9 +253,78 @@ run_on_core_pair(struct lcore_pair *cores, struct
> rte_ring *r,
> > }
> > }
> >
> > +static rte_atomic32_t synchro;
> > +static uint64_t queue_count[RTE_MAX_LCORE] = {0};
> > +
> > +#define TIME_MS 100
> > +
> > +static int
> > +load_loop_fn(void *p)
> > +{
> > + uint64_t time_diff = 0;
> > + uint64_t begin = 0;
> > + uint64_t hz = rte_get_timer_hz();
> > + uint64_t lcount = 0;
> > + const unsigned int lcore = rte_lcore_id();
> > + struct thread_params *params = p;
> > + void *burst[MAX_BURST] = {0};
> > +
> > + /* wait synchro for slaves */
> > + if (lcore != rte_get_master_lcore())
> > + while (rte_atomic32_read(&synchro) == 0)
> > + rte_pause();
> > +
> > + begin = rte_get_timer_cycles();
> > + while (time_diff < hz * TIME_MS / 1000) {
> > + rte_ring_mp_enqueue_bulk(params->r, burst, params->size,
> NULL);
> > + rte_ring_mc_dequeue_bulk(params->r, burst, params->size,
> NULL);
> > + lcount++;
> > + time_diff = rte_get_timer_cycles() - begin;
> > + }
IMO, the method of measurement should be changed to reduce the overhead of reading the timer cycles on every loop iteration.
> > + queue_count[lcore] = lcount;
> > + return 0;
> > +}
> > +
> > +static int
> > +run_on_all_cores(struct rte_ring *r)
> > +{
> > + uint64_t total = 0;
> > + struct thread_params param = {0};
> > + unsigned int i, c;
> > + for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
> > + printf("\nBulk enq/dequeue count on size %u\n",
> bulk_sizes[i]);
> > + param.size = bulk_sizes[i];
> > + param.r = r;
> > +
> > + /* clear synchro and start slaves */
> > + rte_atomic32_set(&synchro, 0);
> > + if (rte_eal_mp_remote_launch(load_loop_fn,
> > + &param, SKIP_MASTER) < 0)
> > + return -1;
> > +
> > + /* start synchro and launch test on master */
> > + rte_atomic32_set(&synchro, 1);
> > + load_loop_fn(&param);
> > +
> > + rte_eal_mp_wait_lcore();
> > +
> > + RTE_LCORE_FOREACH(c) {
> > + printf("Core [%u] count = %"PRIu64"\n",
> > + c, queue_count[c]);
> > + total += queue_count[c];
> > + }
> > +
> > + printf("Total count (size: %u): %"PRIu64"\n", bulk_sizes[i],
> > + total);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > /*
> > - * Test function that determines how long an enqueue + dequeue of a
> > single item
> > - * takes on a single lcore. Result is for comparison with the bulk enq+deq.
> > + * Test function that determines how long an enqueue + dequeue of a
> > + single
> > + * item takes on a single lcore. Result is for comparison with the
> > + bulk
> > + * enq+deq.
> > */
> > static void
> > test_single_enqueue_dequeue(struct rte_ring *r)
> > @@ -394,6 +468,10 @@ test_ring_perf(void)
> > printf("\n### Testing using two NUMA nodes ###\n");
> > run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
> > }
> > +
> > + printf("\n### Testing using all slave nodes ###\n");
> > + run_on_all_cores(r);
> > +
> > rte_ring_free(r);
> > return 0;
> > }
> >
>
>
>
>
More information about the dev
mailing list