@@ -155,75 +155,102 @@ void apply_filtration(const float* pfIn, size_t uiULen, size_t uiVLen, const fl
 
 
 //! Apply filter in the Fourier space
-void apply_filtration2(const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
+void apply_filtration2(const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatchTotal, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
     // Prepare for MultiGPU
     int deviceCount = gpuids.GetLength();
     cudaCheckErrors("Device query fail");
     if (deviceCount == 0) {
-        mexErrMsgIdAndTxt("apply_filtration", "There are no available device(s) that support CUDA\n");
+        mexErrMsgIdAndTxt("apply_filtration2", "There are no available device(s) that support CUDA\n");
     }
     //
     // CODE assumes
     // 1.-All available devices are usable by this code
     // 2.-All available devices are equal, they are the same machine (warning thrown)
     // Check the available devices, and if they are the same
     if (!gpuids.AreEqualDevices()) {
-        mexWarnMsgIdAndTxt("apply_filtration", "Detected one (or more) different GPUs.\nThis code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\nFirst GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
+        mexWarnMsgIdAndTxt("apply_filtration2", "Detected one (or more) different GPUs.\nThis code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\nFirst GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
     }
-    // USING THE FIRST GPU ONLY
     const float* pfIn = pfInAll+uiOffset;
-    cudaSetDevice(gpuids[0]);
-    cudaCheckErrors("apply_filtration fail cudaSetDevice");
-    size_t uiPaddingLen = (uiFLen-uiULen)/2;
-    float* d_pfProjWide = nullptr;
-    cudaMalloc((void**)&d_pfProjWide, uiFLen*uiBatch*sizeof(float));
-    cudaCheckErrors("apply_filtration fail cudaMalloc wide");
-    cudaMemset(d_pfProjWide, 0, uiFLen*uiBatch*sizeof(float));
-    cudaCheckErrors("apply_filtration fail cudaMemset");
-    cudaMemcpy2D(&d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), pfIn, uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
-    cudaCheckErrors("apply_filtration fail cudaMemcpy2D");
-
     const float fFLInv = 1./uiFLen;
+    const size_t uiBufferSize = (uiFLen+1)/2+1;  // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
+    const size_t uiPaddingLen = (uiFLen-uiULen)/2;
+    const size_t uiBatch0 = uiBatchTotal/deviceCount;
 
-    size_t uiBufferSize = (uiFLen+1)/2+1;  // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
+    float** pd_pfProjWide = (float**)malloc(deviceCount*sizeof(float*));
+    float** pd_pfFilter = (float**)malloc(deviceCount*sizeof(float*));
+    cufftComplex** pd_pcfWork = (cufftComplex**)malloc(deviceCount*sizeof(cufftComplex*));
 
-    cufftHandle cudafftPlanFwd;
-    cufftHandle cudafftPlanInv;
-    cufftResult_t fftresult;
-    fftresult = cufftPlan1d(&cudafftPlanFwd, uiFLen, CUFFT_R2C, uiBatch);
-    cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1");
-    fftresult = cufftPlan1d(&cudafftPlanInv, uiFLen, CUFFT_C2R, uiBatch);
-    cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2");
+    size_t* puiBatch = (size_t*)malloc(deviceCount*sizeof(size_t));
+    cufftHandle* pcudafftPlanFwd = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
+    cufftHandle* pcudafftPlanInv = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
 
-    float* d_pfFilter = nullptr;
-    cudaMalloc((void**)&d_pfFilter, uiFLen*sizeof(float));
-    cudaCheckErrors("apply_filtration fail cudaMalloc 2");
-    cudaMemcpy(d_pfFilter, pfFilter, uiFLen*sizeof(float), cudaMemcpyHostToDevice);
-    cudaCheckErrors("apply_filtration fail cudaMemcpy 2");
+    cufftResult_t fftresult;
+
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        pd_pfProjWide[dev] = nullptr;
+        pd_pfFilter[dev] = nullptr;
+        pd_pcfWork[dev] = nullptr;
+        puiBatch[dev] = dev != deviceCount-1 ? uiBatch0 : uiBatchTotal - uiBatch0*dev;
+        size_t uiBatch = puiBatch[dev];
 
-    cufftComplex* d_pcfWork = nullptr;
-    cudaMalloc((void**)&d_pcfWork, uiBufferSize*uiBatch*sizeof(cufftComplex));
-    cudaCheckErrors("apply_filtration fail cudaMalloc 3");
+        // Allocate extended projection
+        cudaSetDevice(gpuids[dev]);
+        cudaCheckErrors("apply_filtration2 fail cudaSetDevice");
+        cudaMalloc((void**)&pd_pfProjWide[dev], uiFLen*uiBatch*sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc wide");
+        // Fill 0
+        cudaMemset(pd_pfProjWide[dev], 0, uiFLen*uiBatch*sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMemset");
+        // Insert projection
+        cudaMemcpy2D(&pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), &pfIn[dev*uiBatch0*uiULen], uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 1");
 
-    {
+        // Filter
+        cudaMalloc((void**)&pd_pfFilter[dev], uiFLen*sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc filter");
+        cudaMemcpy(pd_pfFilter[dev], pfFilter, uiFLen*sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy 2");
+
+        // Work space for FFT.
+        cudaMalloc((void**)&pd_pcfWork[dev], uiBufferSize*uiBatch*sizeof(cufftComplex));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc work");
+    }
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(gpuids[dev]);
+        const size_t uiBatch = puiBatch[dev];
+        fftresult = cufftPlan1d(&pcudafftPlanFwd[dev], uiFLen, CUFFT_R2C, uiBatch);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 1");
+        fftresult = cufftPlan1d(&pcudafftPlanInv[dev], uiFLen, CUFFT_C2R, uiBatch);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 2");
         const int divU = 128;  // PIXEL_SIZE_BLOCK;
         const int divV = 1;    // PIXEL_SIZE_BLOCK;
         dim3 grid((uiFLen+divU-1)/divU, (uiBatch+divV-1)/divV, 1);
         dim3 block(divU, divV, 1);
-        cufftSetStream(cudafftPlanFwd, 0);
-        cufftSetStream(cudafftPlanInv, 0);
-        fftresult = cufftExecR2C(cudafftPlanFwd, d_pfProjWide, d_pcfWork);
-        cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C");
-        ApplyFilter<<<grid, block>>>(d_pcfWork, uiBufferSize, uiBatch, d_pfFilter, fFLInv*fScale);  // Kernel: d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * fScale
-        fftresult = cufftExecC2R(cudafftPlanInv, d_pcfWork, d_pfProjWide);
-        cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R");
+        fftresult = cufftExecR2C(pcudafftPlanFwd[dev], pd_pfProjWide[dev], pd_pcfWork[dev]);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecR2C");
+        ApplyFilter<<<grid, block>>>(pd_pcfWork[dev], uiBufferSize, uiBatch, pd_pfFilter[dev], fFLInv*fScale);  // Kernel: d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * fScale
+        fftresult = cufftExecC2R(pcudafftPlanInv[dev], pd_pcfWork[dev], pd_pfProjWide[dev]);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecC2R");
     }
-    cudaMemcpy2D(pfOut, uiULen*sizeof(float), &d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
-    cudaCheckErrors("apply_filtration fail cudaMemcpy 3");
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(gpuids[dev]);
+        const size_t uiBatch = puiBatch[dev];
+        cudaMemcpy2D(&pfOut[dev*uiBatch0*uiULen], uiULen*sizeof(float), &pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 2");
 
-    cudaFree(d_pcfWork); d_pcfWork = nullptr;
-    cudaFree(d_pfProjWide); d_pfProjWide = nullptr;
-    cudaFree(d_pfFilter); d_pfFilter = nullptr;
-    cufftDestroy(cudafftPlanFwd);
-    cufftDestroy(cudafftPlanInv);
-}
+        cudaFree(pd_pcfWork[dev]); pd_pcfWork[dev] = nullptr;
+        cudaFree(pd_pfProjWide[dev]); pd_pfProjWide[dev] = nullptr;
+        cudaFree(pd_pfFilter[dev]); pd_pfFilter[dev] = nullptr;
+        cufftDestroy(pcudafftPlanFwd[dev]);
+        cufftDestroy(pcudafftPlanInv[dev]);
+        cudaCheckErrors("apply_filtration2 fail free");
+    }
+    cudaDeviceSynchronize();
+
+    free(pd_pfProjWide); pd_pfProjWide = nullptr;
+    free(pd_pfFilter); pd_pfFilter = nullptr;
+    free(puiBatch); puiBatch = nullptr;
+    free(pd_pcfWork); pd_pcfWork = nullptr;
+    free(pcudafftPlanFwd); pcudafftPlanFwd = nullptr;
+    free(pcudafftPlanInv); pcudafftPlanInv = nullptr;
+}
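For reference, a minimal host-only sketch of the per-GPU batch split this hunk introduces: each of the first deviceCount-1 devices gets uiBatchTotal/deviceCount rows and the last device absorbs the remainder, with &pfIn[dev*uiBatch0*uiULen] as the host offset of each chunk. The concrete numbers (10 rows, 3 devices, 512-pixel rows) are made up for illustration; this snippet is standalone C++ and not part of the patch.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t uiBatchTotal = 10;  // illustrative: projection rows handled by this call
    const int deviceCount = 3;       // illustrative: GPUs reported by gpuids
    const size_t uiULen = 512;       // illustrative: detector row length in pixels
    const size_t uiBatch0 = uiBatchTotal / deviceCount;

    std::vector<size_t> puiBatch(deviceCount);
    for (int dev = 0; dev < deviceCount; ++dev) {
        // Same expression as in apply_filtration2: the last device takes the remainder.
        puiBatch[dev] = dev != deviceCount - 1 ? uiBatch0 : uiBatchTotal - uiBatch0 * dev;
        // Host offset (in floats) that the cudaMemcpy2D calls use for this device's chunk.
        const size_t uiOffsetRows = uiBatch0 * static_cast<size_t>(dev);
        printf("dev %d: %zu rows, host offset %zu floats\n",
               dev, puiBatch[dev], uiOffsetRows * uiULen);
    }
    return 0;  // with 10 rows on 3 devices this prints batches of 3, 3 and 4
}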