Skip to content

Commit aa51ec6

Browse files
committed
Make FPD filtering support multiple GPUs
1 parent a0502b5 commit aa51ec6

File tree

1 file changed

+74
-47
lines changed

1 file changed

+74
-47
lines changed

Common/CUDA/FbpFiltration.cu

+74-47
Original file line numberDiff line numberDiff line change
@@ -155,75 +155,102 @@ void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const fl
155155

156156

157157
//! Apply filter in the Fourier space
158-
void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
158+
void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatchTotal, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
159159
// Prepare for MultiGPU
160160
int deviceCount = gpuids.GetLength();
161161
cudaCheckErrors("Device query fail");
162162
if (deviceCount == 0) {
163-
mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n");
163+
mexErrMsgIdAndTxt("apply_filtration2","There are no available device(s) that support CUDA\n");
164164
}
165165
//
166166
// CODE assumes
167167
// 1.-All available devices are usable by this code
168168
// 2.-All available devices are equal, they are the same machine (warning thrown)
169169
// Check the available devices, and if they are the same
170170
if (!gpuids.AreEqualDevices()) {
171-
mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
171+
mexWarnMsgIdAndTxt("apply_filtration2","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
172172
}
173-
// USING THE FIRST GPU ONLY
174173
const float* pfIn = pfInAll+uiOffset;
175-
cudaSetDevice(gpuids[0]);
176-
cudaCheckErrors("apply_filtration fail cudaSetDevice");
177-
size_t uiPaddingLen = (uiFLen-uiULen) / 2;
178-
float* d_pfProjWide = nullptr;
179-
cudaMalloc((void**)&d_pfProjWide, uiFLen*uiBatch*sizeof(float));
180-
cudaCheckErrors("apply_filtration fail cudaMalloc wide");
181-
cudaMemset(d_pfProjWide, 0, uiFLen*uiBatch*sizeof(float));
182-
cudaCheckErrors("apply_filtration fail cudaMemset");
183-
cudaMemcpy2D(&d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), pfIn, uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
184-
cudaCheckErrors("apply_filtration fail cudaMemcpy2D");
185-
186174
const float fFLInv = 1./uiFLen;
175+
const size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
176+
const size_t uiPaddingLen = (uiFLen-uiULen) / 2;
177+
const size_t uiBatch0 = uiBatchTotal / deviceCount;
187178

188-
size_t uiBufferSize = (uiFLen+1)/2+1; // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
179+
float** pd_pfProjWide = (float**)malloc(deviceCount*sizeof(float*));
180+
float** pd_pfFilter = (float**)malloc(deviceCount*sizeof(float*));
181+
cufftComplex** pd_pcfWork = (cufftComplex**)malloc(deviceCount*sizeof(cufftComplex*));
189182

190-
cufftHandle cudafftPlanFwd;
191-
cufftHandle cudafftPlanInv;
192-
cufftResult_t fftresult;
193-
fftresult = cufftPlan1d(&cudafftPlanFwd, uiFLen, CUFFT_R2C, uiBatch);
194-
cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1");
195-
fftresult = cufftPlan1d(&cudafftPlanInv, uiFLen, CUFFT_C2R, uiBatch);
196-
cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2");
183+
size_t* puiBatch = (size_t*)malloc(deviceCount*sizeof(size_t));
184+
cufftHandle* pcudafftPlanFwd = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
185+
cufftHandle* pcudafftPlanInv = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
197186

198-
float* d_pfFilter = nullptr;
199-
cudaMalloc((void **)&d_pfFilter, uiFLen * sizeof(float));
200-
cudaCheckErrors("apply_filtration fail cudaMalloc 2");
201-
cudaMemcpy(d_pfFilter, pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice);
202-
cudaCheckErrors("apply_filtration fail cudaMemcpy 2");
187+
cufftResult_t fftresult;
188+
189+
for (int dev = 0; dev < deviceCount; ++dev) {
190+
pd_pfProjWide[dev] = nullptr;
191+
pd_pfFilter[dev] = nullptr;
192+
pd_pcfWork[dev] = nullptr;
193+
puiBatch[dev] = dev!=deviceCount-1? uiBatch0 : uiBatchTotal - uiBatch0*dev;
194+
size_t uiBatch = puiBatch[dev];
203195

204-
cufftComplex* d_pcfWork = nullptr;
205-
cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiBatch*sizeof(cufftComplex));
206-
cudaCheckErrors("apply_filtration fail cudaMalloc 3");
196+
// Allocate extended projection
197+
cudaSetDevice(gpuids[dev]);
198+
cudaCheckErrors("apply_filtration2 fail cudaSetDevice");
199+
cudaMalloc((void**)&pd_pfProjWide[dev], uiFLen*uiBatch*sizeof(float));
200+
cudaCheckErrors("apply_filtration2 fail cudaMalloc wide");
201+
// Fill 0
202+
cudaMemset(pd_pfProjWide[dev], 0, uiFLen*uiBatch*sizeof(float));
203+
cudaCheckErrors("apply_filtration2 fail cudaMemset");
204+
// Insert projection
205+
cudaMemcpy2D(&pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), &pfIn[dev*uiBatch0*uiULen], uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
206+
cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 1");
207207

208-
{
208+
// Filter
209+
cudaMalloc((void **)&pd_pfFilter[dev], uiFLen * sizeof(float));
210+
cudaCheckErrors("apply_filtration2 fail cudaMalloc filter");
211+
cudaMemcpy(pd_pfFilter[dev], pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice);
212+
cudaCheckErrors("apply_filtration2 fail cudaMemcpy 2");
213+
214+
// Work space for FFT.
215+
cudaMalloc((void **)&pd_pcfWork[dev], uiBufferSize * uiBatch*sizeof(cufftComplex));
216+
cudaCheckErrors("apply_filtration2 fail cudaMalloc work");
217+
}
218+
for (int dev = 0; dev < deviceCount; ++dev) {
219+
cudaSetDevice(gpuids[dev]);
220+
const size_t uiBatch = puiBatch[dev];
221+
fftresult = cufftPlan1d(&pcudafftPlanFwd[dev], uiFLen, CUFFT_R2C, uiBatch);
222+
cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 1");
223+
fftresult = cufftPlan1d(&pcudafftPlanInv[dev], uiFLen, CUFFT_C2R, uiBatch);
224+
cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 2");
209225
const int divU = 128;//PIXEL_SIZE_BLOCK;
210226
const int divV = 1;//PIXEL_SIZE_BLOCK;
211227
dim3 grid((uiFLen+divU-1)/divU,(uiBatch+divV-1)/divV,1);
212228
dim3 block(divU,divV,1);
213-
cufftSetStream(cudafftPlanFwd, 0);
214-
cufftSetStream(cudafftPlanInv, 0);
215-
fftresult = cufftExecR2C (cudafftPlanFwd, d_pfProjWide, d_pcfWork);
216-
cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C");
217-
ApplyFilter<<<grid, block>>>(d_pcfWork, uiBufferSize, uiBatch, d_pfFilter, fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen *
218-
fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfProjWide);
219-
cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R");
229+
fftresult = cufftExecR2C (pcudafftPlanFwd[dev], pd_pfProjWide[dev], pd_pcfWork[dev]);
230+
cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecR2C");
231+
ApplyFilter<<<grid, block>>>(pd_pcfWork[dev], uiBufferSize, uiBatch, pd_pfFilter[dev], fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen *
232+
fftresult = cufftExecC2R (pcudafftPlanInv[dev], pd_pcfWork[dev], pd_pfProjWide[dev]);
233+
cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecC2R");
220234
}
221-
cudaMemcpy2D(pfOut, uiULen*sizeof(float), &d_pfProjWide[uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
222-
cudaCheckErrors("apply_filtration fail cudaMemcpy 3");
235+
for (int dev = 0; dev < deviceCount; ++dev) {
236+
cudaSetDevice(gpuids[dev]);
237+
const size_t uiBatch = puiBatch[dev];
238+
cudaMemcpy2D(&pfOut[dev*uiBatch0*uiULen], uiULen*sizeof(float), &pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
239+
cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 2");
223240

224-
cudaFree(d_pcfWork); d_pcfWork = nullptr;
225-
cudaFree(d_pfProjWide); d_pfProjWide = nullptr;
226-
cudaFree(d_pfFilter); d_pfFilter = nullptr;
227-
cufftDestroy(cudafftPlanFwd);
228-
cufftDestroy(cudafftPlanInv);
229-
}
241+
cudaFree(pd_pcfWork[dev]); pd_pcfWork[dev] = nullptr;
242+
cudaFree(pd_pfProjWide[dev]); pd_pfProjWide[dev] = nullptr;
243+
cudaFree(pd_pfFilter[dev]); pd_pfFilter[dev] = nullptr;
244+
cufftDestroy(pcudafftPlanFwd[dev]);
245+
cufftDestroy(pcudafftPlanInv[dev]);
246+
cudaCheckErrors("apply_filtration2 fail free");
247+
}
248+
cudaDeviceSynchronize();
249+
250+
free(pd_pfProjWide); pd_pfProjWide = nullptr;
251+
free(pd_pfFilter); pd_pfFilter = nullptr;
252+
free(puiBatch); puiBatch = nullptr;
253+
free(pd_pcfWork); pd_pcfWork = nullptr;
254+
free(pcudafftPlanFwd); pcudafftPlanFwd = nullptr;
255+
free(pcudafftPlanInv); pcudafftPlanInv = nullptr;
256+
}

0 commit comments

Comments
 (0)