[dpdk-dev] [PATCH v2] Implement memcmp using AVX/SSE instructions.

Linhaifeng haifeng.lin at huawei.com
Tue May 12 10:13:09 CEST 2015


Hi, Ravi Kerur

On 2015/5/9 5:19, Ravi Kerur wrote:
> Preliminary results on Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz, Ubuntu
> 14.04 x86_64 shows comparisons using AVX/SSE instructions taking 1/3rd
> CPU ticks for 16, 32, 48 and 64 bytes comparison. In addition,

I have written a program to test rte_memcmp and I have a question about the result.
Why does it cost the same number of CPU ticks for 128, 256, 512, 1024 and 1500 bytes?
Is there a problem in my test?


[root at localhost test]# gcc avx_test.c -O3  -I /data/linhf/v2r2c00/open-source/dpdk/dpdk-2.0.0/x86_64-native-linuxapp-gcc/include/ -mavx2 -DRTE_MACHINE_CPUFLAG_AVX2
[root at localhost test]# ./a.out 0
each test run 100000000 times
copy 16 bytes costs average 7(rte_memcmp) 10(memcmp) ticks
copy 32 bytes costs average 9(rte_memcmp) 11(memcmp) ticks
copy 64 bytes costs average 6(rte_memcmp) 13(memcmp) ticks
copy 128 bytes costs average 11(rte_memcmp) 14(memcmp) ticks
copy 256 bytes costs average 9(rte_memcmp) 14(memcmp) ticks
copy 512 bytes costs average 9(rte_memcmp) 14(memcmp) ticks
copy 1024 bytes costs average 9(rte_memcmp) 14(memcmp) ticks
copy 1500 bytes costs average 11(rte_memcmp) 14(memcmp) ticks
[root at localhost test]# ./a.out 1
each test run 100000000 times
copy 16 bytes costs average 2(rte_memcpy) 10(memcpy) ticks
copy 32 bytes costs average 2(rte_memcpy) 10(memcpy) ticks
copy 64 bytes costs average 3(rte_memcpy) 10(memcpy) ticks
copy 128 bytes costs average 7(rte_memcpy) 12(memcpy) ticks
copy 256 bytes costs average 9(rte_memcpy) 23(memcpy) ticks
copy 512 bytes costs average 14(rte_memcpy) 34(memcpy) ticks
copy 1024 bytes costs average 37(rte_memcpy) 61(memcpy) ticks
copy 1500 bytes costs average 62(rte_memcpy) 87(memcpy) ticks


Here is my program:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <smmintrin.h>

#include <rte_cycles.h>
#include <rte_memcmp.h>
#include <rte_memcpy.h>

#define TIMES 100000000L

/*
 * Time TIMES back-to-back n-byte copies with rte_memcpy() and then with
 * memcpy(), and print the average number of TSC ticks per call for each.
 *
 * n: number of bytes copied per call.
 *
 * On allocation failure a message is written to stderr and nothing is
 * measured.
 */
void test_memcpy(size_t n)
{
        uint64_t start, end, i, start2, end2;
        uint8_t *src, *dst;

        src = malloc(n);
        dst = malloc(n);
        if (src == NULL || dst == NULL) {
                fprintf(stderr, "test_memcpy: cannot allocate %zu bytes\n", n);
                free(src);
                free(dst);
                return;
        }

        /* Give the source defined contents instead of copying
         * uninitialised heap memory. */
        memset(src, 0xA5, n);

        start = rte_rdtsc();
        for (i = 0; i < TIMES; i++) {
                rte_memcpy(dst, src, n);
        }
        end = rte_rdtsc();

        start2 = rte_rdtsc();
        for (i = 0; i < TIMES; i++) {
                memcpy(dst, src, n);
        }
        end2 = rte_rdtsc();

        free(src);
        free(dst);

        /* %zu matches size_t; the tick averages are cast so that they
         * match %llu regardless of how uint64_t is defined. */
        printf("copy %zu bytes costs average %llu(rte_memcpy) %llu(memcpy) ticks\n",
               n,
               (unsigned long long)((end - start) / TIMES),
               (unsigned long long)((end2 - start2) / TIMES));
}

/*
 * Time TIMES back-to-back n-byte comparisons with rte_memcmp() and then
 * with memcmp(), and print the average number of TSC ticks per call for
 * each.
 *
 * n: number of bytes compared per call.
 *
 * Returns the sum of the stored memcmp() results; the value exists only so
 * that the results are consumed. Returns -1 if a buffer cannot be allocated
 * (a message is written to stderr in that case).
 */
int test_memcmp(size_t n)
{
        uint64_t start, end, i, start2, end2;
        uint8_t *src, *dst;
        int *ret;
        unsigned int t = 0;     /* unsigned: wraps instead of overflowing */

        src = malloc(n);
        dst = malloc(n);
        ret = malloc(TIMES * sizeof *ret);
        if (src == NULL || dst == NULL || ret == NULL) {
                fprintf(stderr, "test_memcmp: cannot allocate buffers for %zu bytes\n", n);
                free(src);
                free(dst);
                free(ret);
                return -1;
        }

        /* Fill both buffers with the same bytes. Comparing uninitialised
         * memory gives an indeterminate result, and buffers that differ
         * near the start let the comparison finish without looking at the
         * remaining bytes; equal buffers can only be reported equal after
         * all n bytes have been examined. */
        memset(src, 0xA5, n);
        memset(dst, 0xA5, n);

        start = rte_rdtsc();
        for (i = 0; i < TIMES; i++) {
                ret[i] = rte_memcmp(dst, src, n);
        }
        end = rte_rdtsc();

        start2 = rte_rdtsc();
        for (i = 0; i < TIMES; i++) {
                ret[i] = memcmp(dst, src, n);
        }
        end2 = rte_rdtsc();

        /* Consume the stored results so the compiler cannot discard the
         * comparison calls as dead code. */
        for (i = 0; i < TIMES; i++) {
                t += (unsigned int)ret[i];
        }

        free(src);
        free(dst);
        free(ret);

        /* %zu matches size_t; the tick averages are cast so that they
         * match %llu regardless of how uint64_t is defined. */
        printf("copy %zu bytes costs average %llu(rte_memcmp) %llu(memcmp) ticks\n",
               n,
               (unsigned long long)((end - start) / TIMES),
               (unsigned long long)((end2 - start2) / TIMES));
        return (int)t;
}




/*
 * Entry point: run the benchmark for a fixed list of buffer sizes.
 *
 * args[1] selects the benchmark: a non-zero number runs test_memcpy(),
 * anything else runs test_memcmp(). Returns -1 when the argument is
 * missing, 0 otherwise.
 */
int main(int narg, char** args)
{
        static const size_t sizes[] = { 16, 32, 64, 128, 256, 512, 1024, 1500 };
        const size_t count = sizeof sizes / sizeof sizes[0];
        size_t k;

        /* TIMES is a long constant; cast so the argument matches %llu. */
        printf("each test run %llu times\n", (unsigned long long)TIMES);

        if (narg < 2) {
                printf("usage:./avx_test 0/1 1:test memcpy 0:test memcmp\n");
                return -1;
        }

        if (atoi(args[1])) {
                for (k = 0; k < count; k++) {
                        test_memcpy(sizes[k]);
                }
        } else {
                for (k = 0; k < count; k++) {
                        test_memcmp(sizes[k]);
                }
        }

        return 0;
}








More information about the dev mailing list