How to create and use a 1D layered texture in CUDA -


i new cuda. have figured out how 1d , 2d textures in cuda. however, struggling how use 1d layered texture. output of kernel uses texture zeros, incorrect. however, not sure doing wrong. have serious doubts set texture correctly, checked cuda errors everywhere , couldn't find issues. can show me how correctly set 1d layered texture , use it. here code. in advance:

// compile: nvcc backproj.cu -o backproj.out // run: ./backproj.out  // includes, system #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h>  // includes cuda #include <cuda_runtime.h> #include <cuda_profiler_api.h>  #define pi acos(-1)  // 1d float textures texture<float, cudatexturetype1dlayered, cudareadmodeelementtype> texref;  // 1d interpolation kernel: should similar if used 1d interpolation on matlab __global__ void interp1kernel(float* d_output, float* d_locations, int numlocations, int layer) {     unsigned int location_idx = blockidx.x * blockdim.x + threadidx.x;     if (location_idx < numlocations) {         // location want interpolate array         float loc2find = (float) d_locations[location_idx] + 0.5f;         // read texture , write global memory         d_output[location_idx] = tex1dlayered(texref, loc2find, layer);     } }  // host code int main() {     // setup h_data , locations interpolate     const unsigned int len = 10;     const unsigned int numlayers = 3;     const unsigned int upsamp = 3;     const unsigned int loclen = 1 + (len - 1) * upsamp;     float idx_spacing = 1/(float)upsamp;     float h_data[len][numlayers], h_loc[loclen];     (int = 0; < len; i++)          (int j = 0; j < numlayers; j++)             h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));     (int = 0; < loclen; ++)          h_loc[i] = i*idx_spacing;      // memory locations want     float* d_loc;     cudamalloc(&d_loc, loclen * sizeof(float));     cudamemcpy(d_loc, h_loc, loclen*sizeof(float), cudamemcpyhosttodevice);      // allocate cuda array in device memory     cudachannelformatdesc channeldesc = cudacreatechanneldesc(32, 0, 0, 0, cudachannelformatkindfloat);     cudaarray* cuarray;     cudamallocarray(&cuarray, &channeldesc, len, numlayers);      // copy device memory data located @ address h_data in host memory      cudamemcpytoarray(cuarray, 0, 0, h_data, len * numlayers * sizeof(float), cudamemcpyhosttodevice);      // set texture reference parameters     texref.addressmode[0] = cudaaddressmodeborder;     texref.filtermode = cudafiltermodelinear;     texref.normalized = false;      // bind array texture reference     cudabindtexturetoarray(texref, cuarray, channeldesc);      // allocate result of transformation in device memory     float* d_output;     cudamalloc(&d_output, loclen * sizeof(float));      // invoke kernel     int thdsperblk = 256;     int blkspergrid = (int) (loclen / thdsperblk) + 1;     printf("threads per block: %d, blocks per grid: %d\n", thdsperblk, blkspergrid);     interp1kernel <<<blkspergrid, thdsperblk >>>(d_output, d_loc, loclen, 0);      // print results     printf("\n original indices \n");     (int = 0; < len; i++) printf("    %d ", i);     printf("\n original array \n");     (int = 0; < len; i++) printf("%5.3f ", h_data[i][0]);     printf("\n output indices \n");     (int = 0; < loclen; i++) printf("%5.3f ", h_loc[i]);     printf("\n output array \n");     cudamemcpy(h_loc, d_output, loclen * sizeof(float), cudamemcpydevicetohost);     (int = 0; < loclen; i++) printf("%5.3f ", h_loc[i]);     printf("\n");      // free device memory     cudafreearray(cuarray);     cudafree(d_output);      return 0; } 

you must use cudamalloc3darray cudaarraylayered flag set allocate memory layered textures. there complete example of layered texture usage in toolkit samples can study see how work.


Comments

Popular posts from this blog

amazon web services - S3 Pre-signed POST validate file type? -

c# - Check Keyboard Input Winforms -