当我在SO上遇到这个问题时,我很想知道答案.所以我在下面写了一段代码来测试不同场景下的原子操作性能.操作系统是带有CUDA 5.5的Ubuntu 12.04,设备是GeForce GTX780(开普勒架构).我使用-O3
flag 编译代码,CC = 3.5.
#includestatic void HandleError( cudaError_t err, const char *file, int line ) { if (err != cudaSuccess) { printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) #define BLOCK_SIZE 256 #define RESTRICTION_SIZE 32 __global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem) { unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( data+i, 6); //arbitrary number to add } } __global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem) { unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6); //arbitrary number to add } } __global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem) { unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( data+(i>>5), 6); //arbitrary number to add } } __global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem) { unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( data, 6); //arbitrary number to add } } __global__ void CoalescedAtomicOnSharedMem(int* data, int nElem) { __shared__ int smem_data[BLOCK_SIZE]; unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( smem_data+threadIdx.x, data[i]); } } __global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem) { __shared__ int smem_data[BLOCK_SIZE]; unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]); } } __global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem) { __shared__ int smem_data[BLOCK_SIZE]; unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]); } } __global__ void SameAddressAtomicOnSharedMem(int* data, int nElem) { __shared__ int smem_data[BLOCK_SIZE]; unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){ atomicAdd( smem_data, data[0]); } } int main(void) { const int n = 2 << 24; int* data = new int[n]; int i; for(i=0; i >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads AddressRestrictedAtomicOnGlobalMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads WarpRestrictedAtomicOnGlobalMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads SameAddressAtomicOnGlobalMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads CoalescedAtomicOnSharedMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads AddressRestrictedAtomicOnSharedMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads WarpRestrictedAtomicOnSharedMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); for(int i=0; i<50; i++) { dim3 blocksize(BLOCK_SIZE); dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads SameAddressAtomicOnSharedMem<< >>( dev_data, n); HANDLE_ERROR( cudaPeekAtLastError() ); } HANDLE_ERROR( cudaDeviceSynchronize() ); HANDLE_ERROR( cudaDeviceReset() ); printf("Program finished without error.\n"); return 0; }
基本上在上面的代码中有8个内核,其中所有线程都atomicAdd
对所有数据执行操作.
在全局内存上合并原子加法.
全局内存中受限地址空间的原子添加.(代码中的32)
经线通道的原子添加在全局存储器中的相同地址上.
在全局内存中的同一地址上添加所有线程的原子.
通过将以上项目中的共享替换为global,可以找到项目5到8.选择的块大小为256.
我曾经nvprof
描述过这个程序.输出是:
Time(%) Time Calls Avg Min Max Name 44.33% 2.35113s 50 47.023ms 46.987ms 47.062ms SameAddressAtomicOnSharedMem(int*, int) 31.89% 1.69104s 50 33.821ms 33.818ms 33.826ms SameAddressAtomicOnGlobalMem(int*, int) 10.10% 535.88ms 50 10.718ms 10.707ms 10.738ms WarpRestrictedAtomicOnSharedMem(int*, int) 3.96% 209.95ms 50 4.1990ms 4.1895ms 4.2103ms AddressRestrictedAtomicOnSharedMem(int*, int) 3.95% 209.47ms 50 4.1895ms 4.1893ms 4.1900ms AddressRestrictedAtomicOnGlobalMem(int*, int) 3.33% 176.48ms 50 3.5296ms 3.5050ms 3.5498ms WarpRestrictedAtomicOnGlobalMem(int*, int) 1.08% 57.428ms 50 1.1486ms 1.1460ms 1.1510ms CoalescedAtomicOnGlobalMem(int*, int) 0.84% 44.784ms 50 895.68us 888.65us 905.77us CoalescedAtomicOnSharedMem(int*, int) 0.51% 26.805ms 1 26.805ms 26.805ms 26.805ms [CUDA memcpy HtoD] 0.01% 543.61us 1 543.61us 543.61us 543.61us [CUDA memset]
显然,合并无冲突的原子操作具有最佳性能,同一地址的性能最差.我无法解释的一件事是,与全局内存(在所有线程之间通用)相比,为什么共享内存(块内)的相同地址原子比较慢.
当所有warp通道访问共享内存中的相同位置时,性能非常差,但是(令人惊讶的是)当它们在全局内存上执行时却不是这样.我无法解释原因.另一个混淆的情况是全局的地址限制原子的性能比warp中的所有线程在同一地址上执行时更糟,而第一种情况下的内存争用似乎更低.
无论如何,如果有人能解释上面的分析结果,我会很高兴.
作为前瞻性陈述,在某种程度上,我在这里的评论可能是特定于架构的.但是对于手头的架构(高达cc 3.5,AFAIK),共享内存原子通过代码序列(由汇编器创建)实现.如果多个线程争用对同一存储体/位置的访问,则在共享存储器上操作的该代码序列将被序列化.
RMW操作本身是原子的,没有其他线程可以破坏操作(即创建不正确的结果),但是当线程争用在单个共享内存位置上进行原子操作时,争用会导致序列化,加剧与原子相关的延迟.
引用CUDA手册中的尼克:
与使用单个指令(GATOM或GRED,取决于是否使用返回值)实现原子的全局内存不同,共享内存原子是使用显式锁定/解锁语义实现的,并且编译器发出导致每个线程循环的代码这些锁操作直到线程执行了原子操作.
和:
注意避免争用,或者清单8-2中的循环最多可迭代32次.
我建议你至少阅读完整的8.1.5部分.