CUDA — Does cudaMallocManaged allocate memory on the device?


I'm using unified memory to simplify access to data on both the CPU and the GPU. As far as I know, cudaMallocManaged should allocate memory on the device. I wrote this simple code to check that:

// Tile-copy kernel demo: each thread stages one element of g_input in a
// shared-memory tile, then writes it back to g_output scaled by 1.3.
// Expects a 2D launch with blockDim == (BDIMX, BDIMY) and a grid that
// covers a dimx x dimy element array.

#define TYPE  float
#define BDIMX 16
#define BDIMY 16

#include <cuda.h>
#include <cstdio>
#include <iostream>

__global__ void kernel(TYPE *g_output, TYPE *g_input,
                       const int dimx, const int dimy)
{
    // One shared-memory tile per thread block.
    __shared__ float s_data[BDIMY][BDIMX];

    int ix     = blockIdx.x * blockDim.x + threadIdx.x; // global x index
    int iy     = blockIdx.y * blockDim.y + threadIdx.y; // global y index
    int in_idx = iy * dimx + ix;                        // flat index for reading input

    int tx = threadIdx.x; // thread's x-index into the shared-memory tile
    int ty = threadIdx.y; // thread's y-index into the shared-memory tile

    // Guard the grid tail: the grid may overhang the array when the sizes
    // are not multiples of the block dimensions.
    bool in_range = (ix < dimx) && (iy < dimy);

    if (in_range)
        s_data[ty][tx] = g_input[in_idx];
    __syncthreads(); // barrier kept outside the divergent branch

    if (in_range)
        g_output[in_idx] = s_data[ty][tx] * 1.3f; // float literal: avoid double promotion
}

int main()
{
    int size_x = 16, size_y = 16;

    // Ceil-divide the problem size by the block dimensions.
    dim3 numtb((size_x + BDIMX - 1) / BDIMX,
               (size_y + BDIMY - 1) / BDIMY);
    dim3 tbsize(BDIMX, BDIMY);

    TYPE  *a, *a_out;
    size_t bytes = size_x * size_y * sizeof(TYPE);

    // Managed allocations: accessible from both host and device.
    if (cudaMallocManaged((void**)&a, bytes)     != cudaSuccess ||
        cudaMallocManaged((void**)&a_out, bytes) != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    kernel<<<numtb, tbsize>>>(a_out, a, size_x, size_y);

    cudaError_t err = cudaGetLastError();  // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();     // catch asynchronous execution errors
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    cudaFree(a);
    cudaFree(a_out);
    return 0;
}

Note that I'm not accessing the data on the CPU, to avoid page faults, so the memory should supposedly be in device memory. However, when I run nvprof on this code, I get the following results:

  invocations                               metric name                        metric description         min         max         avg device "tesla k40c (0)" kernel: kernel(float*, float*, int, int)         1                   local_load_transactions                   local load transactions           0           0           0         1                  local_store_transactions                  local store transactions           0           0           0         1                  shared_load_transactions                  shared load transactions           8           8           8         1                 shared_store_transactions                 shared store transactions           8           8           8         1                          gld_transactions                  global load transactions           8           8           8         1                          gst_transactions                 global store transactions           8           8           8         1                  sysmem_read_transactions           system memory read transactions          32          32          32         1                 sysmem_write_transactions          system memory write transactions          34          34          34         1                    tex_cache_transactions                texture cache transactions           0           0           0         1                    dram_read_transactions           device memory read transactions           0           0           0         1                   dram_write_transactions          device memory write transactions           0           0           0 

So apparently the array is allocated in system memory, not device memory. What am I missing here?

Managed memory really does allocate physical memory on the GPU. You can confirm this is the case for yourself with the following code:

#include <iostream>

// Query the driver for the device's current free and total memory and
// print both, so changes in physical GPU allocation are visible.
void report_gpu_mem()
{
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    std::cout << "free = " << free << " total = " << total << std::endl;
}

int main()
{
    float *a, *a_out;
    size_t sz = 1 << 24; // 16 MB per allocation

    // Two managed allocations followed by two frees, with the free-memory
    // counter reported before and after each call. No host or device access
    // occurs, so any change in free memory reflects physical allocation on
    // the GPU, not migration traffic.
    report_gpu_mem();
    cudaMallocManaged((void**)&a, sz);
    report_gpu_mem();
    cudaMallocManaged((void**)&a_out, sz);
    report_gpu_mem();
    cudaFree(a);
    report_gpu_mem();
    cudaFree(a_out);
    report_gpu_mem();

    return cudaDeviceReset();
}

This allocates 16MB for each of two managed allocations, then frees them. No host or device access occurs, so there should be no triggered transfers or synchronisation. The size is large enough that it should exceed the minimum granularity of the GPU memory manager and trigger changes in visible free memory. Compiling and running it gives this:

$ nvcc -arch=sm_52 sleepy.cu  $ cuda_visible_devices="0" ./a.out  free = 4211929088 total = 4294770688 free = 4194869248 total = 4294770688 free = 4178092032 total = 4294770688 free = 4194869248 total = 4294770688 free = 4211654656 total = 4294770688 

i.e. the physical free memory on the GPU is being decremented and incremented by 16MB at each alloc/free.


Comments

Popular posts from this blog

Django REST Framework perform_create: You cannot call `.save()` after accessing `serializer.data` -

Why does Go error when trying to marshal this JSON? -