I am trying to achieve 3-way overlap using 3 streams, as in the examples in the CUDA Streams and Concurrency webinar, but I couldn't manage it.
I have a GeForce GT 550M (Fermi architecture with one copy engine), and I am using Windows 7 (64-bit).
Here is the code I have written.
#include <cmath>
#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h"  // helper utility functions

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end = dataSize;
    // grid-stride loop over the chunk assigned to this launch
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for (int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for (int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for (int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;

    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors( cudaHostRegister(h_in, memSize, cudaHostRegisterPortable) );
    checkCudaErrors( cudaHostRegister(h_out, memSize, cudaHostRegisterPortable) );

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1, 1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors( cudaStreamCreate(&streams[i]) );

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors( cudaEventCreate(&start) );
    checkCudaErrors( cudaEventCreate(&stop) );

    cudaEventRecord(start, 0);

    // overlapped execution using version 2

    // all H2D copies first, one chunk per stream (breadth-first issue order)
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);

    // then all kernel launches, each chunk split into two half-sized launches
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize / 2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize / 2], &d_out[offset + streamSize / 2], streamSize / 2);
    }

    // finally all D2H copies
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors( cudaStreamSynchronize(streams[i]) );

    cudaEventRecord(stop, 0);

    checkCudaErrors( cudaStreamSynchronize(0) );
    checkCudaErrors( cudaDeviceSynchronize() );

    float gpu_time = 0;
    checkCudaErrors( cudaEventElapsedTime(&gpu_time, start, stop) );

    // release resources
    checkCudaErrors( cudaEventDestroy(start) );
    checkCudaErrors( cudaEventDestroy(stop) );
    checkCudaErrors( cudaHostUnregister(h_in) );
    checkCudaErrors( cudaHostUnregister(h_out) );
    checkCudaErrors( cudaFree(d_in) );
    checkCudaErrors( cudaFree(d_out) );

    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors( cudaStreamDestroy(streams[i]) );

    cudaDeviceReset();

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check
    int sum = 0;
    for (int i = 0; i < dataSize; i++)
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}
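As a sanity check on the one-copy-engine claim above, the relevant device capabilities can be queried with a small standalone program; this is a minimal sketch of my own (separate from the code above), not part of the original program:

// Prints the device properties that matter for copy/kernel overlap.
#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device             : %s\n", prop.name);
    printf("asyncEngineCount   : %d\n", prop.asyncEngineCount);  // 1 = single copy engine
    printf("concurrentKernels  : %d\n", prop.concurrentKernels); // 1 = concurrent kernels supported
    return 0;
}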
Profiling with Nsight, I get the following result:
It looks correct, but why does the D2H transfer in stream #1 start only with the last kernel launch of stream #2, and not earlier? I also tried to use 8 streams (just by changing NUM_STREAMS to 8) to achieve such a 3-way overlap, and here is the result:
The interesting thing is that when I use 8 streams, the overlap between computation and memory transfers seems to be much better.
What is the reason for this problem? Is it caused by the WDDM driver, or is there something wrong with my program?
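If WDDM command batching is the cause, one workaround I have seen suggested (but have not verified on my machine) is to flush the command queue after issuing each stream's work, for example with cudaStreamQuery(). A sketch of what I mean, reusing the variables from the program above:

// Suspected WDDM workaround (unverified here): cudaStreamQuery() returns
// immediately, but as a side effect it forces the driver to submit the
// commands queued so far instead of batching them.
for (int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize,
                    cudaMemcpyHostToDevice, streams[i]);
    cudaStreamQuery(streams[i]); // flush the WDDM command queue
}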