cuda - Does cudaMallocManaged allocate memory on the device?
I'm using unified memory to simplify access to data on the CPU and the GPU. As far as I know, cudaMallocManaged should allocate memory on the device. I wrote a simple piece of code to check that:
#define type float
#define bdimx 16
#define bdimy 16

#include <cuda.h>
#include <cmath>
#include <cstdio>
#include <iostream>

__global__ void kernel(type *g_output, type *g_input, const int dimx, const int dimy)
{
    __shared__ float s_data[bdimy][bdimx];

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int in_idx = iy * dimx + ix; // index for reading input

    int tx = threadIdx.x; // thread's x-index into the shared memory tile
    int ty = threadIdx.y; // thread's y-index into the shared memory tile

    s_data[ty][tx] = g_input[in_idx];
    __syncthreads();

    g_output[in_idx] = s_data[ty][tx] * 1.3;
}

int main()
{
    int size_x = 16, size_y = 16;

    dim3 numtb;
    numtb.x = (int)ceil((double)(size_x) / (double)bdimx);
    numtb.y = (int)ceil((double)(size_y) / (double)bdimy);

    dim3 tbsize;
    tbsize.x = bdimx;
    tbsize.y = bdimy;

    float *a, *a_out;
    cudaMallocManaged((void**)&a, size_x * size_y * sizeof(type));
    cudaMallocManaged((void**)&a_out, size_x * size_y * sizeof(type));

    kernel<<<numtb, tbsize>>>(a_out, a, size_x, size_y);
    cudaDeviceSynchronize();

    return 0;
}
I'm deliberately not accessing the data on the CPU, to avoid page faults, so the memory should supposedly be in device memory. However, when I run nvprof on this code, I get the following results:
Device "Tesla K40c (0)"
    Kernel: kernel(float*, float*, int, int)

Invocations  Metric Name                Metric Description                Min  Max  Avg
          1  local_load_transactions   Local Load Transactions             0    0    0
          1  local_store_transactions  Local Store Transactions            0    0    0
          1  shared_load_transactions  Shared Load Transactions            8    8    8
          1  shared_store_transactions Shared Store Transactions           8    8    8
          1  gld_transactions          Global Load Transactions            8    8    8
          1  gst_transactions          Global Store Transactions           8    8    8
          1  sysmem_read_transactions  System Memory Read Transactions    32   32   32
          1  sysmem_write_transactions System Memory Write Transactions   34   34   34
          1  tex_cache_transactions    Texture Cache Transactions          0    0    0
          1  dram_read_transactions    Device Memory Read Transactions     0    0    0
          1  dram_write_transactions   Device Memory Write Transactions    0    0    0
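(For reference, the exact profiler invocation isn't shown above; a metrics run along these lines produces a table like this one:)

$ nvprof --metrics all ./a.out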
So apparently the array is allocated in system memory and not in device memory. What am I missing here?
Managed memory really does allocate physical memory on the GPU. You can confirm this is the case with the following code:
#include <iostream>
#include <cuda_runtime.h>

void report_gpu_mem()
{
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    std::cout << "free = " << free << " total = " << total << std::endl;
}

int main()
{
    float *a, *a_out;
    size_t sz = 1 << 24; // 16MB

    report_gpu_mem();
    cudaMallocManaged((void**)&a, sz);
    report_gpu_mem();
    cudaMallocManaged((void**)&a_out, sz);
    report_gpu_mem();
    cudaFree(a);
    report_gpu_mem();
    cudaFree(a_out);
    report_gpu_mem();

    return cudaDeviceReset();
}
This allocates 16MB for each of two managed allocations and then frees them. No host or device access occurs, so there should be no triggered transfers or synchronisation. The size is large enough that it should exceed the minimum granularity of the GPU memory manager and trigger visible changes in free memory. Compiling and running it does this:
$ nvcc -arch=sm_52 sleepy.cu
$ CUDA_VISIBLE_DEVICES="0" ./a.out
free = 4211929088 total = 4294770688
free = 4194869248 total = 4294770688
free = 4178092032 total = 4294770688
free = 4194869248 total = 4294770688
free = 4211654656 total = 4294770688
The physical free memory on the GPU is being decremented and incremented by 16MB at each alloc/free.
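As an aside, if you want a managed allocation resident in device memory before a kernel runs (rather than relying on demand migration), CUDA 8 added cudaMemPrefetchAsync for GPUs that support concurrent managed access (Pascal and newer; the K40 in the question does not, and there managed data is instead migrated to the GPU in bulk at kernel launch). A minimal sketch, assuming device 0:

#include <iostream>
#include <cuda_runtime.h>

int main()
{
    int dev = 0;
    cudaSetDevice(dev);

    // cudaMemPrefetchAsync requires a device with concurrent managed
    // access (Pascal or newer); check the attribute before using it.
    int concurrent = 0;
    cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);

    size_t sz = 1 << 24; // 16MB
    float *a;
    cudaMallocManaged((void**)&a, sz);

    if (concurrent) {
        // Migrate the managed allocation into device memory up front, so a
        // subsequent kernel reads it from DRAM rather than over PCIe.
        cudaMemPrefetchAsync(a, sz, dev, 0);
        cudaDeviceSynchronize();
    } else {
        std::cout << "Device does not support concurrent managed access" << std::endl;
    }

    cudaFree(a);
    return 0;
}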