
Commit d7d2a6b

gpt-2 : better check for CPU backend when setting n_threads
1 parent: 45d13b1

2 files changed: +8 additions, -9 deletions


examples/gpt-2/main.cpp

Lines changed: 3 additions & 5 deletions
@@ -760,11 +760,9 @@ bool gpt2_eval(
     ggml_allocr_alloc_graph(allocr, gf);
 
     // run the computation
-#ifndef GGML_USE_CUBLAS
-    // FIXME: the backend may be CPU even if CUDA is enabled
-    // if (model.backend.id == GGML_BACKEND_ID_CPU)
-    ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-#endif
+    if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
     ggml_backend_graph_compute(model.backend, gf);
 
     //if (n_past%100 == 0) {
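The old compile-time guard skipped the thread setting whenever GGML_USE_CUBLAS was defined, even though a CUDA-enabled build can still end up with the CPU backend (the FIXME it removes says as much); the new code checks the backend's name at runtime instead. A minimal sketch of the same pattern in isolation, assuming ggml-backend.h and a backend handle obtained elsewhere (the helper name is hypothetical, not part of this commit):

    #include <cstring>
    #include "ggml-backend.h"

    // Set the thread count only when the backend really is the CPU backend.
    // ggml_backend_name() returns "CPU" for the CPU backend, so the check
    // holds regardless of which backends were compiled in.
    static void set_threads_if_cpu(ggml_backend_t backend, int n_threads) {
        if (strcmp(ggml_backend_name(backend), "CPU") == 0) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
    }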

src/ggml-cuda.cu

Lines changed: 5 additions & 4 deletions
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
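The new entry extends the CUDA-to-HIP translation table at the top of the file, so code written against the CUDA runtime also builds for ROCm. A minimal illustration of the call it covers (the helper is hypothetical, not from this commit):

    #include <cuda_runtime.h>

    // Zero a device buffer without blocking the host: the memset is queued
    // on the given stream. Under a HIP build, the #define table above maps
    // cudaMemsetAsync to hipMemsetAsync.
    static void zero_device_buffer(void * dst, size_t nbytes, cudaStream_t stream) {
        cudaMemsetAsync(dst, 0, nbytes, stream);
    }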
@@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
     const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;

@@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
 
 
 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
     const dim3 block_nums(block_num_x, nrows, 1);
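The launch geometry pairs with the kernel above: each k_get_rows thread covers two columns (col = (blockIdx.x*blockDim.x + threadIdx.x)*2), so block_num_x is a ceiling division over twice the block width. A standalone sketch of that arithmetic, with a hypothetical column count and an assumed block size of 256:

    #include <cstdio>

    #define CUDA_GET_ROWS_BLOCK_SIZE 256 // assumed value, for illustration only

    int main() {
        const int ncols = 1000; // hypothetical column count
        // Round up so a final partial block still covers the trailing columns.
        const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
        printf("block_num_x = %d\n", block_num_x); // (1000 + 511) / 512 = 2
        return 0;
    }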
@@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat(
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
 
-    // TODO: very inefficient, implement in a kernel
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
     for (int i3 = 0; i3 < nr3; i3++) {
         for (int k3 = 0; k3 < ne03; k3++) {
             for (int i2 = 0; i2 < nr2; i2++) {
@@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows(
     const int ncols = src0->ne[0];
     const int nrows = ggml_nelements(src1);
 
-    const int * src1_i32 = (const int *) src1_d;
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
 
     switch (src0->type) {
         case GGML_TYPE_F16:
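The same fixed-width change as in the kernels: the row indices in src1 come from a GGML_TYPE_I32 tensor, so reading them through int32_t makes the 4-byte element width explicit instead of relying on the platform's int. A minimal sketch of the cast (the helper name is hypothetical, not from the commit):

    #include <cstdint>

    // Reinterpret the raw data pointer of a GGML_TYPE_I32 tensor as
    // 32-bit row indices; int32_t is guaranteed to be exactly 4 bytes.
    static const int32_t * as_row_indices(const void * src1_d) {
        static_assert(sizeof(int32_t) == 4, "row indices are 4 bytes each");
        return static_cast<const int32_t *>(src1_d);
    }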
