// Compares the performance of copying a buffer byte by byte, with 128-bit SIMD, and with 256-bit SIMD.
#include <immintrin.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Size in bytes of each copy buffer. The SIMD loops in main() step through the
// buffers 16 and 32 bytes at a time, so this should be a multiple of 32 bytes
// (256 bits) for the buffers to divide into whole chunks; any multiple of 128
// satisfies that.
#define TEST_DATA_SIZE (128 * 10000000)
/*
 * Benchmarks three ways of copying TEST_DATA_SIZE bytes from one heap buffer
 * to another and prints the CPU time each one took:
 *   1. one byte per iteration,
 *   2. 16 bytes per iteration using 128-bit SSE2 loads/stores,
 *   3. 32 bytes per iteration using 256-bit AVX loads/stores.
 *
 * After each timed run the destination is compared against the source
 * (outside the timed region) so that the copy is both validated and
 * observably used.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffers cannot be allocated or
 * if any copy did not reproduce the source.
 */
int main(int argc, char const* argv[]) {
    (void)argc;
    (void)argv;

    clock_t start, end;
    double cpu_time_used;
    int all_copies_ok = 1;

    puts("Char copying comparison test\n-------------------------");
    printf("Using data size of %llu characters\n", (unsigned long long)TEST_DATA_SIZE);

    char* chr_src = malloc(TEST_DATA_SIZE);
    char* chr_dst = calloc(1, TEST_DATA_SIZE);
    if (chr_src == NULL || chr_dst == NULL) {
        fputs("Failed to allocate the copy buffers\n", stderr);
        free(chr_src);
        free(chr_dst);
        return EXIT_FAILURE;
    }
    puts("Allocated the copy buffers");

    const char* src_ptr = chr_src;
    char* dst_ptr = chr_dst;
    size_t counter;
    memset(chr_src, 1, TEST_DATA_SIZE);

    /* --- Byte-by-byte copy ------------------------------------------- */
    puts("Starting copy test");
    start = clock();
    /* Test the counter before decrementing so every byte is copied. */
    for (counter = TEST_DATA_SIZE; counter != 0; counter -= 1) {
        *dst_ptr++ = *src_ptr++;
    }
    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("The time taken for normal copy is %f seconds\n", cpu_time_used);
    if (memcmp(chr_dst, chr_src, TEST_DATA_SIZE) != 0) {
        fputs("Normal copy did not reproduce the source buffer\n", stderr);
        all_copies_ok = 0;
    }

    /* --- 128-bit SIMD copy ------------------------------------------- */
    memset(chr_dst, 0, TEST_DATA_SIZE);
    src_ptr = chr_src;
    dst_ptr = chr_dst;
    puts("Starting SIMD 128bit test");
    start = clock();
    /* Unaligned load/store: malloc'd memory is not guaranteed to meet the
       alignment that the aligned intrinsics require. */
    for (counter = TEST_DATA_SIZE; counter >= 16; counter -= 16) {
        _mm_storeu_si128((__m128i*)dst_ptr, _mm_loadu_si128((__m128i const*)src_ptr));
        src_ptr += 16;
        dst_ptr += 16;
    }
    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("The time taken for SIMD 128bit copy is %f seconds\n", cpu_time_used);
    if (memcmp(chr_dst, chr_src, TEST_DATA_SIZE) != 0) {
        fputs("SIMD 128bit copy did not reproduce the source buffer\n", stderr);
        all_copies_ok = 0;
    }

    /* --- 256-bit SIMD copy ------------------------------------------- */
    memset(chr_dst, 0, TEST_DATA_SIZE);
    src_ptr = chr_src;
    dst_ptr = chr_dst;
    puts("Starting SIMD 256bit test");
    start = clock();
    /* The aligned 256-bit load needs a 32-byte-aligned address, which the
       allocator does not promise; the unaligned form is safe either way. */
    for (counter = TEST_DATA_SIZE; counter >= 32; counter -= 32) {
        _mm256_storeu_si256((__m256i*)dst_ptr, _mm256_loadu_si256((__m256i const*)src_ptr));
        src_ptr += 32;
        dst_ptr += 32;
    }
    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("The time taken for SIMD 256bit copy is %f seconds\n", cpu_time_used);
    if (memcmp(chr_dst, chr_src, TEST_DATA_SIZE) != 0) {
        fputs("SIMD 256bit copy did not reproduce the source buffer\n", stderr);
        all_copies_ok = 0;
    }

    // clean up
    free(chr_src);
    free(chr_dst);
    return all_copies_ok ? 0 : EXIT_FAILURE;
}
#undef TEST_DATA_SIZE
/* Example Result
C:\Users\jweinstein\blog\covid>char_copy.exe
Char copying comparison test
-------------------------
Using data size of 1280000000 characters
Allocated the copy buffers
Starting copy test
The time taken for normal copy is 3.516000 seconds
Starting SIMD 128bit test
The time taken for SIMD 128bit copy is 0.235000 seconds
Starting SIMD 256bit test
The time taken for SIMD 256bit copy is 0.176000 seconds
*/