How to create and use a 1D layered texture in CUDA -
i new cuda. have figured out how 1d , 2d textures in cuda. however, struggling how use 1d layered texture. output of kernel uses texture zeros, incorrect. however, not sure doing wrong. have serious doubts set texture correctly, checked cuda errors everywhere , couldn't find issues. can show me how correctly set 1d layered texture , use it. here code. in advance:
// compile: nvcc backproj.cu -o backproj.out // run: ./backproj.out // includes, system #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> // includes cuda #include <cuda_runtime.h> #include <cuda_profiler_api.h> #define pi acos(-1) // 1d float textures texture<float, cudatexturetype1dlayered, cudareadmodeelementtype> texref; // 1d interpolation kernel: should similar if used 1d interpolation on matlab __global__ void interp1kernel(float* d_output, float* d_locations, int numlocations, int layer) { unsigned int location_idx = blockidx.x * blockdim.x + threadidx.x; if (location_idx < numlocations) { // location want interpolate array float loc2find = (float) d_locations[location_idx] + 0.5f; // read texture , write global memory d_output[location_idx] = tex1dlayered(texref, loc2find, layer); } } // host code int main() { // setup h_data , locations interpolate const unsigned int len = 10; const unsigned int numlayers = 3; const unsigned int upsamp = 3; const unsigned int loclen = 1 + (len - 1) * upsamp; float idx_spacing = 1/(float)upsamp; float h_data[len][numlayers], h_loc[loclen]; (int = 0; < len; i++) (int j = 0; j < numlayers; j++) h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f)); (int = 0; < loclen; ++) h_loc[i] = i*idx_spacing; // memory locations want float* d_loc; cudamalloc(&d_loc, loclen * sizeof(float)); cudamemcpy(d_loc, h_loc, loclen*sizeof(float), cudamemcpyhosttodevice); // allocate cuda array in device memory cudachannelformatdesc channeldesc = cudacreatechanneldesc(32, 0, 0, 0, cudachannelformatkindfloat); cudaarray* cuarray; cudamallocarray(&cuarray, &channeldesc, len, numlayers); // copy device memory data located @ address h_data in host memory cudamemcpytoarray(cuarray, 0, 0, h_data, len * numlayers * sizeof(float), cudamemcpyhosttodevice); // set texture reference parameters texref.addressmode[0] = cudaaddressmodeborder; texref.filtermode = cudafiltermodelinear; texref.normalized = false; // bind array texture reference cudabindtexturetoarray(texref, cuarray, channeldesc); // allocate result of transformation in device memory float* d_output; cudamalloc(&d_output, loclen * sizeof(float)); // invoke kernel int thdsperblk = 256; int blkspergrid = (int) (loclen / thdsperblk) + 1; printf("threads per block: %d, blocks per grid: %d\n", thdsperblk, blkspergrid); interp1kernel <<<blkspergrid, thdsperblk >>>(d_output, d_loc, loclen, 0); // print results printf("\n original indices \n"); (int = 0; < len; i++) printf(" %d ", i); printf("\n original array \n"); (int = 0; < len; i++) printf("%5.3f ", h_data[i][0]); printf("\n output indices \n"); (int = 0; < loclen; i++) printf("%5.3f ", h_loc[i]); printf("\n output array \n"); cudamemcpy(h_loc, d_output, loclen * sizeof(float), cudamemcpydevicetohost); (int = 0; < loclen; i++) printf("%5.3f ", h_loc[i]); printf("\n"); // free device memory cudafreearray(cuarray); cudafree(d_output); return 0; }
you must use cudamalloc3darray
cudaarraylayered
flag set allocate memory layered textures. there complete example of layered texture usage in toolkit samples can study see how work.
Comments
Post a Comment