|
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 #include "gemmList.h"
 #include "runner.h"
 #include "trtllmGenSrc/DevKernel.h"
-#include "trtllmGenSrc/MixtureOfExpertsInterface.h"
 #include "trtllmGenSrc/RoutingKernel.h"
 #include <iostream>
 
|
@@ -135,43 +134,6 @@ void Runner::run(void* hidden_state, void* hidden_state_scale, void* weight, voi
     TLLM_CHECK_WITH_INFO(selectedIndex.size() == 1, "Multiple kernels found for the given element type");
     auto const& kernelInfo = gemmList[*selectedIndex.begin()];
 
-    // TODO: remove this once we find the way to shuffle the weights offline
-    // if (kernelInfo.shuffledMatrixA || kernelInfo.useFusedAct)
-    // {
-    //     // Allocate temporary buffers for shuffled weights using vectors
-    //     auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(mDtypeElt);
-    //     const size_t numBytesA = num_experts * hidden_size * intermediate_size * 2 * numBitsPerElt / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledA(numBytesA);
-
-    //     auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(mDtypeElt));
-    //     const size_t numSfBytes = num_experts * hidden_size * intermediate_size / 16 * 2 * numBitsPerSf / /* bits */
-    //     8; std::vector<uint8_t> hShuffledASf(numSfBytes);
-
-    //     // Copy weights to host
-    //     cudaMemcpy(hShuffledA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
-    //     cudaMemcpy(hShuffledASf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);
-
-    //     // Prepare and shuffle the weights
-    //     prepareBatchWeightsOnHost(hShuffledA.data(), // wIn
-    //         hShuffledASf.data(),                     // wSfIn
-    //         hShuffledA.data(),                       // wOut (in-place)
-    //         hShuffledASf.data(),                     // wSfOut (in-place)
-    //         mDtypeElt,                               // dtypeElt
-    //         intermediate_size * 2,                   // m (2x for gated activation)
-    //         hidden_size,                             // k
-    //         kernelInfo.epilogueTileM,                // epilogueTileM (from tileN)
-    //         num_experts,                             // numBatches
-    //         kernelInfo.shuffledMatrixA,              // useShuffleMatrix
-    //         kernelInfo.useFusedAct,                  // useFusedAct (for gated activation)
-    //         mDtypeElt == tg::Dtype::E2m1,            // useBlockScaling
-    //         16                                       // numEltsPerSf (for E2m1)
-    //     );
-
-    //     // Copy shuffled weights back to device
-    //     // cudaMemcpy(weight, hShuffledA.data(), numBytesA, cudaMemcpyHostToDevice);
-    //     // cudaMemcpy(weight_scale, hShuffledASf.data(), numSfBytes, cudaMemcpyHostToDevice);
-    // }
-
     gemmCommon::MyOptions options;
     options.mTopK = top_k;
     options.mBatchM = false;
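
The block removed above staged the gemm1 expert weights through the host on every call: copy to host, reorder with `prepareBatchWeightsOnHost`, copy back. Per the TODO, the intent is to perform this shuffle offline instead. Below is a minimal sketch of what that one-time preprocessing step could look like, reusing only the calls and arguments from the removed block; the function name `shuffleGemm1WeightsOffline` and the `int32_t` parameter types are hypothetical, and `prepareBatchWeightsOnHost`, `trtllm::gen::dtypeGetNumBits`, and `tg::dtypeBlockSfType` are assumed available via the headers this file already includes.

```cpp
// Sketch only: a one-time, offline version of the shuffle that the disabled
// block performed at runtime. Hypothetical names are marked in the lead-in.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

void shuffleGemm1WeightsOffline(void* weight, void* weight_scale, tg::Dtype dtypeElt, int32_t num_experts,
    int32_t hidden_size, int32_t intermediate_size, int32_t epilogueTileM, bool shuffledMatrixA, bool useFusedAct)
{
    auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(dtypeElt);
    // 2x intermediate_size: gemm1 holds both halves of the gated activation.
    size_t const numBytesA = size_t(num_experts) * hidden_size * intermediate_size * 2 * numBitsPerElt / /* bits */ 8;
    std::vector<uint8_t> hA(numBytesA);

    auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(dtypeElt));
    size_t const numSfBytes = size_t(num_experts) * hidden_size * intermediate_size / 16 * 2 * numBitsPerSf / 8;
    std::vector<uint8_t> hSf(numSfBytes);

    // Stage the device-resident weights on the host.
    cudaMemcpy(hA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
    cudaMemcpy(hSf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);

    // Reorder in place with the same arguments the disabled block used.
    prepareBatchWeightsOnHost(hA.data(), hSf.data(), hA.data(), hSf.data(), dtypeElt,
        intermediate_size * 2,       // m (2x for gated activation)
        hidden_size,                 // k
        epilogueTileM,               // epilogueTileM (from tileN)
        num_experts,                 // numBatches
        shuffledMatrixA,             // useShuffleMatrix
        useFusedAct,                 // useFusedAct (for gated activation)
        dtypeElt == tg::Dtype::E2m1, // useBlockScaling
        16);                         // numEltsPerSf (for E2m1)

    // Write the shuffled layout back once, before any inference call.
    cudaMemcpy(weight, hA.data(), numBytesA, cudaMemcpyHostToDevice);
    cudaMemcpy(weight_scale, hSf.data(), numSfBytes, cudaMemcpyHostToDevice);
}
```

Unlike the removed runtime path, this sketch writes the shuffled layout back to the device, so it must run exactly once, before the first inference call.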
|
@@ -239,43 +201,6 @@ void Runner::run(void* permuted_hidden_state, void* permuted_hidden_state_scale,
     TLLM_CHECK_WITH_INFO(selectedIndex.size() == 1, "Multiple kernels found for the given element and output types");
     auto const& kernelInfo = gemmList[*selectedIndex.begin()];
 
-    // TODO: remove this once we find the way to shuffle the weights offline
-    // if (kernelInfo.shuffledMatrixA)
-    // {
-    //     // Allocate temporary buffers for shuffled weights using vectors
-    //     auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(mDtypeElt);
-    //     const size_t numBytesA = num_experts * hidden_size * intermediate_size * numBitsPerElt / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledA(numBytesA);
-
-    //     auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(mDtypeElt));
-    //     const size_t numSfBytes = num_experts * hidden_size * intermediate_size / 16 * numBitsPerSf / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledASf(numSfBytes);
-
-    //     // Copy weights to host
-    //     cudaMemcpy(hShuffledA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
-    //     cudaMemcpy(hShuffledASf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);
-
-    //     // Prepare and shuffle the weights
-    //     prepareBatchWeightsOnHost(hShuffledA.data(), // wIn
-    //         hShuffledASf.data(),                     // wSfIn
-    //         hShuffledA.data(),                       // wOut (in-place)
-    //         hShuffledASf.data(),                     // wSfOut (in-place)
-    //         mDtypeElt,                               // dtypeElt
-    //         hidden_size,                             // m
-    //         intermediate_size,                       // k
-    //         kernelInfo.epilogueTileM,                // epilogueTileM (from tileN)
-    //         num_experts,                             // numBatches
-    //         kernelInfo.shuffledMatrixA,              // useShuffleMatrix
-    //         false,                                   // useFusedAct (for gated activation)
-    //         mDtypeElt == tg::Dtype::E2m1,            // useBlockScaling
-    //         16                                       // numEltsPerSf (for E2m1)
-    //     );
-
-    //     // Copy shuffled weights back to device
-    //     // cudaMemcpy(weight, hShuffledA.data(), numBytesA, cudaMemcpyHostToDevice);
-    //     // cudaMemcpy(weight_scale, hShuffledASf.data(), numSfBytes, cudaMemcpyHostToDevice);
-    // }
-
     gemmCommon::MyOptions options;
     options.mTopK = top_k;
     options.mBatchM = false;
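
The block removed here is the same host-staging pattern, applied to the gemm2 weights. The only differences visible in the diff are the GEMM shape (`m = hidden_size`, `k = intermediate_size`, with no 2x factor because there is no gated activation on this GEMM) and `useFusedAct = false`. The offline-shuffle sketch above covers this case by substituting those values.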
|
@@ -373,16 +298,7 @@ void Runner::run(MoERunnerArgs const& args, MoEWorkspace const& workspace, cudaS
 
     setOpsData(args, workspace, convertSfData, activationData, finalizeData);
 
-    // Calling routing outside to properly allocate workspace
-    // moe::dev::routing::run(routingData, stream);
-
     void* hidden_states_scale_linear{args.hidden_states_scale};
-    // FIXME check that we receive r128c4 sf layout
-    // if (args.mDtypeElt == tg::Dtype::E2m1)
-    // {
-    //     hidden_states_scale_linear = workspace.hidden_states_scale_linear;
-    //     moe::dev::convertsf::run(convertSfData, stream);
-    // }
 
     PermuteGemm1::Runner permuteGemm1(args.mDtypeElt);
     permuteGemm1.run(args.hidden_states, hidden_states_scale_linear, args.gemm1_weights, args.gemm1_weights_scale,
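
Two behaviors leave this overload: routing now runs outside the runner (its comment says this is so the workspace can be allocated properly), and the E2m1 scale-factor conversion is dropped, with the FIXME noting the runner should verify it receives the r128c4 scale-factor layout. A hedged sketch of the corresponding caller-side sequence, assuming `routingData` and `convertSfData` are populated the same way `setOpsData` populates them; the `needsSfConversion` flag is hypothetical:

```cpp
// Hypothetical call site for the logic removed from the runner: launch
// routing first so its results can size the MoE workspace.
moe::dev::routing::run(routingData, stream);
// ... allocate MoEWorkspace from the routing results ...

// Mirror of the disabled conversion: only convert E2m1 scale factors into
// the workspace buffer if they are not already in the expected layout.
void* hidden_states_scale = args.hidden_states_scale;
if (args.mDtypeElt == tg::Dtype::E2m1 && needsSfConversion)
{
    hidden_states_scale = workspace.hidden_states_scale_linear;
    moe::dev::convertsf::run(convertSfData, stream);
}
```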
|
@@ -411,36 +327,6 @@ void Runner::run(MoERunnerArgs const& args, MoEWorkspace const& workspace, cudaS
 
     // Run finalize
     moe::dev::finalize::run(finalizeData, stream);
-
-    // std::vector<uint8_t> gemm1_output_fp8(64 * args.intermediate_size / 2);
-    // printf("array addr 0x%lx\n", &gemm1_output_fp8[0]);
-    // printf("local_num_experts addr 0x%lx\n", &args.local_num_experts);
-    // cudaMemcpy(gemm1_output_fp8.data(), workspace.gemm1_output, gemm1_output_fp8.size() * sizeof(uint8_t),
-    //     cudaMemcpyDeviceToHost);
-    // std::cout << "args.local_num_experts: " << args.local_num_experts << std::endl;
-    // std::cout << "gemm1 output (hex):" << std::endl;
-    // for (int offset = 0; offset < 8; offset++)
-    // {
-    //     int base = offset * 2048;
-    //     for (int i = 0; i < args.num_tokens; i++)
-    //     {
-    //         for (int j = 0; j < args.intermediate_size / 2; j += 16)
-    //         {
-    //             std::cout << "Token " << i << " [" << std::dec << base + j << "]: ";
-    //             for (int k = 0; k < 16 && (j + k) < args.intermediate_size / 2; k++)
-    //             {
-    //                 // std::cout << "offset: " << std::dec << base + i * args.intermediate_size / 2 + j + k <<
-    //                 // std::endl;
-    //                 std::cout << "0x" << std::hex << std::setw(2) << std::setfill('0')
-    //                           << static_cast<uint>(gemm1_output_fp8[base + i * args.intermediate_size / 2 + j + k])
-    //                           << " ";
-    //             }
-    //             std::cout << std::endl;
-    //         }
-    //         std::cout << std::endl;
-    //     }
-    // }
-    // std::cout << "args.local_num_experts: " << args.local_num_experts << std::endl;
 }
 } // namespace MoE
 
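
The last removed block was an ad-hoc hex dump of `workspace.gemm1_output` used for debugging. If that kind of inspection is still needed occasionally, a small self-contained helper keeps it out of the runner. A sketch using only standard host and CUDA runtime calls; the helper name `dumpDeviceBufferHex` is hypothetical:

```cpp
// Sketch: copy a device buffer to the host and print it as hex rows,
// replacing the hand-rolled loops in the removed debug block.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

void dumpDeviceBufferHex(void const* devPtr, size_t numBytes, size_t bytesPerRow = 16)
{
    std::vector<uint8_t> host(numBytes);
    cudaMemcpy(host.data(), devPtr, numBytes, cudaMemcpyDeviceToHost);
    for (size_t i = 0; i < numBytes; i += bytesPerRow)
    {
        // Row header: byte offset in decimal, then the bytes in hex.
        std::cout << "[" << std::dec << std::setw(6) << i << "]:";
        for (size_t j = i; j < std::min(numBytes, i + bytesPerRow); ++j)
        {
            std::cout << " 0x" << std::hex << std::setw(2) << std::setfill('0')
                      << static_cast<unsigned>(host[j]);
        }
        std::cout << std::setfill(' ') << std::endl;
    }
}
```

For example, `dumpDeviceBufferHex(workspace.gemm1_output, 64 * args.intermediate_size / 2);` matches the buffer and size the removed code inspected.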
|
|