[dpdk-dev] [PATCH v2 4/4] test/lpm: avoid code duplication in rcu qsbr perf

Dharmik Thakkar Dharmik.Thakkar at arm.com
Tue Nov 3 15:03:09 CET 2020



> On Nov 2, 2020, at 11:32 PM, Honnappa Nagarahalli <Honnappa.Nagarahalli at arm.com> wrote:
> 
> <snip>
> 
>>>> 
>>>> Avoid code duplication by combining single and multi threaded tests
>>>> 
>>>> Signed-off-by: Dharmik Thakkar <dharmik.thakkar at arm.com>
>>>> Reviewed-by: Ruifeng Wang <ruifeng.wang at arm.com>
>>>> ---
>>>> app/test/test_lpm_perf.c | 362
>>>> ++++++++++-----------------------------
>>>> 1 file changed, 91 insertions(+), 271 deletions(-)
>>>> 
>>>> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
>>>> index
>>>> 224c92fa3d65..229c835c23f7 100644
>>>> --- a/app/test/test_lpm_perf.c
>>>> +++ b/app/test/test_lpm_perf.c
>>>> @@ -67,6 +67,12 @@ enum {
>>>> IP_CLASS_C
>>>> };
>>>> 
>>>> +enum {
>>>> +SINGLE_WRITER = 1,
>>>> +MULTI_WRITER_1,
>>>> +MULTI_WRITER_2
>>>> +};
>>> Do we need this? Can we use the number of cores instead?
>>> 
>> 
>> There are 3 combinations of writes (adds/deletes):
>> There are 3 combinations of writes (adds/deletes):
>> 1. Write all the entries - in case of a single writer
>> 2. Write half of the entries - in case of multiple writers
>> 3. Write the remaining half of the entries - in case of multiple writers
>> 
>> So, I think this is required.
> IMO, this is not scalable. Essentially, we need 2 parameters to divide the routes among each writer thread. We need 2 parameters, 1) total number of writers 2) the core ID in the linear space.
> Creating a structure with these 2 and passing that to the writer thread would be better and scalable.

Yes, agreed this is only applicable for 2 writers. Currently, the multi writer test is only limited to a maximum of 2 writers.
To support a greater number of writers, we need something like this (which I believe is in line with your suggestion):
1. Calculate what each writer will write: single_insert = TOTAL_WRITES / num_writers
2. Pass core ID in linear space as an argument to the writer function: pos_core
3. Calculate si and ei in the writer function: si = pos_core * single_insert; ei = si + single_insert

I can update the patch to enable more than 2 writers.
Do you also suggest we expand the scope of the test to test with more than 2 writers?
This will increase the time for which the test is running (which currently is significant even with 2 writers).

> 
>> 
>>>> +
>>>> /* struct route_rule_count defines the total number of rules in
>>>> following a/b/c
>>>> * each item in a[]/b[]/c[] is the number of common IP address class
>>>> A/B/C, not
>>>> * including the ones for private local network.
>>>> @@ -430,11 +436,16 @@ test_lpm_rcu_qsbr_writer(void *arg)  {
>> unsigned
>>>> int i, j, si, ei; uint64_t begin, total_cycles; -uint8_t core_id =
>>>> (uint8_t)((uintptr_t)arg);
>>>> +uint8_t writer_id = (uint8_t)((uintptr_t)arg);
>>>> uint32_t next_hop_add = 0xAA;
>>>> 
>>>> -/* 2 writer threads are used */
>>>> -if (core_id % 2 == 0) {
>>>> +/* Single writer (writer_id = 1) */
>>>> +if (writer_id == SINGLE_WRITER) {
>>>> +si = 0;
>>>> +ei = NUM_LDEPTH_ROUTE_ENTRIES;
>>>> +}
>>>> +/* 2 Writers (writer_id = 2/3)*/
>>>> +else if (writer_id == MULTI_WRITER_1) {
>>>> si = 0;
>>>> ei = NUM_LDEPTH_ROUTE_ENTRIES / 2;
>>>> } else {
>>>> @@ -447,29 +458,35 @@ test_lpm_rcu_qsbr_writer(void *arg) for (i = 0;
>>>> i < RCU_ITERATIONS; i++) {
>>>> /* Add all the entries */
>>>> for (j = si; j < ei; j++) {
>>>> -pthread_mutex_lock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +pthread_mutex_lock(&lpm_mutex);
>>>> if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
>>>> large_ldepth_route_table[j].depth,
>>>> next_hop_add) != 0) {
>>>> printf("Failed to add iteration %d, route# %d\n", i, j);
>>>> -pthread_mutex_unlock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +
>>>> pthread_mutex_unlock(&lpm_mutex);
>>>> return -1;
>>>> }
>>>> -pthread_mutex_unlock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +pthread_mutex_unlock(&lpm_mutex);
>>>> }
>>>> 
>>>> /* Delete all the entries */
>>>> for (j = si; j < ei; j++) {
>>>> -pthread_mutex_lock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +pthread_mutex_lock(&lpm_mutex);
>>>> if (rte_lpm_delete(lpm,
>>>> large_ldepth_route_table[j].ip,
>>>> large_ldepth_route_table[j].depth) != 0) { printf("Failed to delete
>>>> iteration %d, route# %d\n", i, j); -pthread_mutex_unlock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +
>>>> pthread_mutex_unlock(&lpm_mutex);
>>>> return -1;
>>>> }
>>>> -pthread_mutex_unlock(&lpm_mutex);
>>>> +if (writer_id != SINGLE_WRITER)
>>>> +pthread_mutex_unlock(&lpm_mutex);
>>>> }
>>>> }
>>>> 
>>>> @@ -482,16 +499,17 @@ test_lpm_rcu_qsbr_writer(void *arg)
>>>> 
>>>> /*
>>>> * Functional test:
>>>> - * 2 writers, rest are readers
>>>> + * 1/2 writers, rest are readers
>>>> */
>>>> static int
>>>> -test_lpm_rcu_perf_multi_writer(void)
>>>> +test_lpm_rcu_perf_multi_writer(uint8_t use_rcu)
>>>> {
>>>> struct rte_lpm_config config;
>>>> size_t sz;
>>>> -unsigned int i;
>>>> +unsigned int i, j;
>>>> uint16_t core_id;
>>>> struct rte_lpm_rcu_config rcu_cfg = {0};
>>>> +int (*reader_f)(void *arg) = NULL;
>>>> 
>>>> if (rte_lcore_count() < 3) {
>>>> printf("Not enough cores for lpm_rcu_perf_autotest, expecting at
>>>> least 3\n"); @@ -504,273 +522,76 @@
>>>> test_lpm_rcu_perf_multi_writer(void)
>>>> num_cores++;
>>>> }
>>>> 
>>>> -printf("\nPerf test: 2 writers, %d readers, RCU integration
>>>> enabled\n", -num_cores - 2);
>>>> -
>>>> -/* Create LPM table */
>>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
>> config.number_tbl8s =
>>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
>>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>>>> -TEST_LPM_ASSERT(lpm != NULL);
>>>> -
>>>> -/* Init RCU variable */
>>>> -sz = rte_rcu_qsbr_get_memsize(num_cores);
>>>> -rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
>>>> -RTE_CACHE_LINE_SIZE); -rte_rcu_qsbr_init(rv, num_cores);
>>>> -
>>>> -rcu_cfg.v = rv;
>>>> -/* Assign the RCU variable to LPM */ -if (rte_lpm_rcu_qsbr_add(lpm,
>>>> &rcu_cfg) != 0) { -printf("RCU variable assignment failed\n"); -goto
>>>> error; -}
>>>> -
>>>> -writer_done = 0;
>>>> -__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
>>>> -
>>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
>>>> -
>>>> -/* Launch reader threads */
>>>> -for (i = 2; i < num_cores; i++)
>>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
>>>> -enabled_core_ids[i]);
>>>> -
>>>> -/* Launch writer threads */
>>>> -for (i = 0; i < 2; i++)
>>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
>>>> -(void *)(uintptr_t)i,
>>>> -enabled_core_ids[i]);
>>>> -
>>>> -/* Wait for writer threads */
>>>> -for (i = 0; i < 2; i++)
>>>> -if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0) -goto error;
>>>> -
>>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
>>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del:
>>>> %"PRIu64" cycles\n", -__atomic_load_n(&gwrite_cycles,
>>>> __ATOMIC_RELAXED) -/ TOTAL_WRITES);
>>>> -
>>>> -writer_done = 1;
>>>> -/* Wait until all readers have exited */ -for (i = 2; i < num_cores;
>>>> i++) -rte_eal_wait_lcore(enabled_core_ids[i]);
>>>> -
>>>> -rte_lpm_free(lpm);
>>>> -rte_free(rv);
>>>> -lpm = NULL;
>>>> -rv = NULL;
>>>> -
>>>> -/* Test without RCU integration */
>>>> -printf("\nPerf test: 2 writers, %d readers, RCU integration
>>>> disabled\n", -num_cores - 2);
>>>> -
>>>> -/* Create LPM table */
>>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
>> config.number_tbl8s =
>>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
>>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>>>> -TEST_LPM_ASSERT(lpm != NULL);
>>>> -
>>>> -writer_done = 0;
>>>> -__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
>>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
>>>> -
>>>> -/* Launch reader threads */
>>>> -for (i = 2; i < num_cores; i++)
>>>> -rte_eal_remote_launch(test_lpm_reader, NULL, -enabled_core_ids[i]);
>>>> -
>>>> -/* Launch writer threads */
>>>> -for (i = 0; i < 2; i++)
>>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
>>>> -(void *)(uintptr_t)i,
>>>> -enabled_core_ids[i]);
>>>> -
>>>> -/* Wait for writer threads */
>>>> -for (i = 0; i < 2; i++)
>>>> -if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0) -goto error;
>>>> -
>>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
>>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del:
>>>> %"PRIu64" cycles\n", -__atomic_load_n(&gwrite_cycles,
>>>> __ATOMIC_RELAXED) -/ TOTAL_WRITES);
>>>> -
>>>> -writer_done = 1;
>>>> -/* Wait until all readers have exited */ -for (i = 2; i < num_cores;
>>>> i++) -rte_eal_wait_lcore(enabled_core_ids[i]);
>>>> -
>>>> -rte_lpm_free(lpm);
>>>> -
>>>> -return 0;
>>>> -
>>>> -error:
>>>> -writer_done = 1;
>>>> -/* Wait until all readers have exited */ -rte_eal_mp_wait_lcore();
>>>> -
>>>> -rte_lpm_free(lpm);
>>>> -rte_free(rv);
>>>> -
>>>> -return -1;
>>>> -}
>>>> -
>>>> -/*
>>>> - * Functional test:
>>>> - * Single writer, rest are readers
>>>> - */
>>>> -static int
>>>> -test_lpm_rcu_perf(void)
>>>> -{
>>>> -struct rte_lpm_config config;
>>>> -uint64_t begin, total_cycles;
>>>> -size_t sz;
>>>> -unsigned int i, j;
>>>> -uint16_t core_id;
>>>> -uint32_t next_hop_add = 0xAA;
>>>> -struct rte_lpm_rcu_config rcu_cfg = {0};
>>>> -
>>>> -if (rte_lcore_count() < 2) {
>>>> -printf("Not enough cores for lpm_rcu_perf_autotest, expecting at
>>>> least 2\n"); -return TEST_SKIPPED; -}
>>>> -
>>>> -num_cores = 0;
>>>> -RTE_LCORE_FOREACH_WORKER(core_id) {
>>>> -enabled_core_ids[num_cores] = core_id; -num_cores++; -}
>>>> -
>>>> -printf("\nPerf test: 1 writer, %d readers, RCU integration
>>>> enabled\n", -num_cores);
>>>> -
>>>> -/* Create LPM table */
>>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
>> config.number_tbl8s =
>>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
>>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>>>> -TEST_LPM_ASSERT(lpm != NULL);
>>>> -
>>>> -/* Init RCU variable */
>>>> -sz = rte_rcu_qsbr_get_memsize(num_cores);
>>>> -rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
>>>> -RTE_CACHE_LINE_SIZE); -rte_rcu_qsbr_init(rv, num_cores);
>>>> -
>>>> -rcu_cfg.v = rv;
>>>> -/* Assign the RCU variable to LPM */ -if (rte_lpm_rcu_qsbr_add(lpm,
>>>> &rcu_cfg) != 0) { -printf("RCU variable assignment failed\n"); -goto
>>>> error; -}
>>>> -
>>>> -writer_done = 0;
>>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
>>>> -
>>>> -/* Launch reader threads */
>>>> -for (i = 0; i < num_cores; i++)
>>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
>>>> -enabled_core_ids[i]);
>>>> -
>>>> -/* Measure add/delete. */
>>>> -begin = rte_rdtsc_precise();
>>>> -for (i = 0; i < RCU_ITERATIONS; i++) {
>>>> -/* Add all the entries */
>>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if (rte_lpm_add(lpm,
>>>> large_ldepth_route_table[j].ip, -large_ldepth_route_table[j].depth,
>>>> -next_hop_add) != 0) {
>>>> -printf("Failed to add iteration %d, route# %d\n", -i, j);
>>>> +for (j = 1; j < 3; j++) {
>>>> +if (use_rcu)
>>>> +printf("\nPerf test: %d writer(s), %d reader(s),"
>>>> +       " RCU integration enabled\n", j, num_cores - j); else
>>>> +printf("\nPerf test: %d writer(s), %d reader(s),"
>>>> +       " RCU integration disabled\n", j, num_cores - j);
>>>> +
>>>> +/* Create LPM table */
>>>> +config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
>> config.number_tbl8s =
>>>> +NUM_LDEPTH_ROUTE_ENTRIES; config.flags = 0; lpm =
>>>> +rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>>>> +TEST_LPM_ASSERT(lpm != NULL);
>>>> +
>>>> +/* Init RCU variable */
>>>> +if (use_rcu) {
>>>> +sz = rte_rcu_qsbr_get_memsize(num_cores);
>>>> +rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
>>>> +
>>>> RTE_CACHE_LINE_SIZE);
>>>> +rte_rcu_qsbr_init(rv, num_cores);
>>>> +
>>>> +rcu_cfg.v = rv;
>>>> +/* Assign the RCU variable to LPM */ if (rte_lpm_rcu_qsbr_add(lpm,
>>>> +&rcu_cfg) != 0) { printf("RCU variable assignment failed\n");
>>>> goto error;
>>>> }
>>>> 
>>>> -/* Delete all the entries */
>>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if
>>>> (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
>>>> -large_ldepth_route_table[j].depth) != 0) { -printf("Failed to delete
>>>> iteration %d, route# %d\n", -i, j); -goto error; -} -} -total_cycles
>>>> = rte_rdtsc_precise() - begin;
>>>> +reader_f = test_lpm_rcu_qsbr_reader; } else reader_f =
>>>> +test_lpm_reader;
>>>> 
>>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
>>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del: %g
>>>> cycles\n", -(double)total_cycles / TOTAL_WRITES);
>>>> +writer_done = 0;
>>>> +__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
>>>> 
>>>> -writer_done = 1;
>>>> -/* Wait until all readers have exited */ -for (i = 0; i < num_cores;
>>>> i++) -if (rte_eal_wait_lcore(enabled_core_ids[i]);
>>>> -
>>>> -rte_lpm_free(lpm);
>>>> -rte_free(rv);
>>>> -lpm = NULL;
>>>> -rv = NULL;
>>>> -
>>>> -/* Test without RCU integration */
>>>> -printf("\nPerf test: 1 writer, %d readers, RCU integration
>>>> disabled\n", -num_cores);
>>>> -
>>>> -/* Create LPM table */
>>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
>> config.number_tbl8s =
>>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
>>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>>>> -TEST_LPM_ASSERT(lpm != NULL);
>>>> +__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
>>>> 
>>>> -writer_done = 0;
>>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
>>>> +/* Launch reader threads */
>>>> +for (i = j; i < num_cores; i++)
>>>> +rte_eal_remote_launch(reader_f, NULL,
>>>> +enabled_core_ids[i]);
>>>> 
>>>> -/* Launch reader threads */
>>>> -for (i = 0; i < num_cores; i++)
>>>> -rte_eal_remote_launch(test_lpm_reader, NULL,
>>>> -enabled_core_ids[i]);
>>>> +/* Launch writer threads */
>>>> +for (i = 0; i < j; i++)
>>>> +rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
>>>> +(void *)(uintptr_t)(i + j),
>>> This can be just 'j'?
>>> 
>>>> +enabled_core_ids[i]);
>>>> 
>>>> -/* Measure add/delete. */
>>>> -begin = rte_rdtsc_precise();
>>>> -for (i = 0; i < RCU_ITERATIONS; i++) {
>>>> -/* Add all the entries */
>>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
>>>> -if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
>>>> -large_ldepth_route_table[j].depth,
>>>> -next_hop_add) != 0) {
>>>> -printf("Failed to add iteration %d, route#
>>>> %d\n",
>>>> -i, j);
>>>> +/* Wait for writer threads */
>>>> +for (i = 0; i < j; i++)
>>>> +if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
>>>> goto error;
>>>> -}
>>>> 
>>>> -/* Delete all the entries */
>>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
>>>> -if (rte_lpm_delete(lpm,
>>>> large_ldepth_route_table[j].ip,
>>>> -large_ldepth_route_table[j].depth) != 0) {
>>>> -printf("Failed to delete iteration %d, route#
>>>> %d\n",
>>>> -i, j);
>>>> -goto error;
>>>> -}
>>>> +printf("Total LPM Adds: %d\n", TOTAL_WRITES);
>>>> +printf("Total LPM Deletes: %d\n", TOTAL_WRITES);
>>>> +printf("Average LPM Add/Del: %"PRIu64" cycles\n",
>>>> +__atomic_load_n(&gwrite_cycles,
>>>> __ATOMIC_RELAXED)
>>>> +/ TOTAL_WRITES);
>>>> +
>>>> +writer_done = 1;
>>>> +/* Wait until all readers have exited */
>>>> +for (i = j; i < num_cores; i++)
>>>> +rte_eal_wait_lcore(enabled_core_ids[i]);
>>>> +
>>>> +rte_lpm_free(lpm);
>>>> +rte_free(rv);
>>>> +lpm = NULL;
>>>> +rv = NULL;
>>>> }
>>>> -total_cycles = rte_rdtsc_precise() - begin;
>>>> -
>>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES);
>>>> -printf("Total LPM Deletes: %d\n", TOTAL_WRITES);
>>>> -printf("Average LPM Add/Del: %g cycles\n",
>>>> -(double)total_cycles / TOTAL_WRITES);
>>>> -
>>>> -writer_done = 1;
>>>> -/* Wait until all readers have exited */
>>>> -for (i = 0; i < num_cores; i++)
>>>> -rte_eal_wait_lcore(enabled_core_ids[i]);
>>>> -
>>>> -rte_lpm_free(lpm);
>>>> 
>>>> return 0;
>>>> 
>>>> @@ -946,9 +767,8 @@ test_lpm_perf(void)
>>>> rte_lpm_delete_all(lpm);
>>>> rte_lpm_free(lpm);
>>>> 
>>>> -test_lpm_rcu_perf();
>>>> -
>>>> -test_lpm_rcu_perf_multi_writer();
>>>> +test_lpm_rcu_perf_multi_writer(0);
>>>> +test_lpm_rcu_perf_multi_writer(1);
>>>> 
>>>> return 0;
>>>> }
>>>> --
>>>> 2.17.1



More information about the dev mailing list