@@ -266,7 +266,7 @@ __global__ void rms_norm_kernel(AllReduceParams params)
     local_final_output_buffer += block_offset;
     intermediate_buffer += block_offset;
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -309,7 +309,7 @@ __global__ void rms_norm_kernel(AllReduceParams params)
         inter_vec.packed = rms_norm<T, Affine>(denom, inter_vec, weight_vec);
         *reinterpret_cast<int4*>(&local_final_output_buffer[offset]) = inter_vec.packed;
     }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -340,7 +340,7 @@ __global__ void rms_pre_post_norm_kernel(AllReduceParams params) // for gemma2 p
     local_final_output_buffer += block_offset;
     intermediate_buffer += block_offset;
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -393,7 +393,7 @@ __global__ void rms_pre_post_norm_kernel(AllReduceParams params) // for gemma2 p
         inter_vec.packed = rms_norm<T, Affine>(denom, inter_vec, weight_vec);
         *reinterpret_cast<int4*>(&local_final_output_buffer[offset]) = inter_vec.packed;
     }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -744,7 +744,7 @@ struct Reducer<T, RanksPerNode, false>
 template <int ClusterSize, typename T, int RanksPerNode, bool Bias = false, bool Affine = false, bool PushMode = true>
 static __global__ void lamport_style_one_shot_all_reduce_norm_kernel(AllReduceParams params)
 {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     namespace cg = cooperative_groups;
     static_assert(RanksPerNode <= MAX_RANKS_PER_NODE);
     static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T);
@@ -937,7 +937,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne
         buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -1001,7 +1001,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne
             *reinterpret_cast<int4*>(&local_final_output_buffer[norm_offset + offset]) = sum_vec.packed;
         }
     }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1044,7 +1044,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_prenorm_all_reduce_no
         buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -1114,7 +1114,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_prenorm_all_reduce_no
         sum_vec.packed = rms_norm<T, Affine>(denom, sum_vec, weight_vec);
         *reinterpret_cast<int4*>(&local_final_output_buffer[norm_offset + thread_offset]) = sum_vec.packed;
     }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1128,7 +1128,7 @@ bool is_lamport_supported(int token_num, int hidden_size)
     if (disableLamportReduceNormFusion)
         return false;
     static int sm = tensorrt_llm::common::getSMVersion();
-    if (sm < 90)
+    if (sm < 90 || sm >= 120)
     {
         return false;
     }
@@ -1355,7 +1355,7 @@ static __global__ void oneShotAllReduceKernel(AllReduceParams params)
         buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -1424,7 +1424,7 @@ static __global__ void oneShotAllReduceKernel(AllReduceParams params)
         *reinterpret_cast<int4*>(&local_output_buffer[iter_offset]) = sums.packed;
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1497,7 +1497,7 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
         buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaGridDependencySynchronize();
 #endif
 
@@ -1631,7 +1631,7 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
         }
     }
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }