 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
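
The new define extends the CUDA-to-HIP shim so that asynchronous memset calls also compile on ROCm builds. A minimal sketch of how such a shim is used (the helper below is illustrative, not part of the patch):

    #include <cstddef>
    #include <cuda_runtime.h>  // on ROCm builds, the hip headers plus the defines above take its place

    static cudaError_t zero_async(void * dev_ptr, size_t nbytes, cudaStream_t stream) {
        // With "#define cudaMemsetAsync hipMemsetAsync" in effect, the preprocessor
        // rewrites this call to the HIP equivalent; the source stays CUDA-flavored.
        return cudaMemsetAsync(dev_ptr, 0, nbytes, stream);
    }
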
@@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
     const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
 
@@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
|
4586 | 4587 |
|
4587 | 4588 |
|
4588 | 4589 | template<int qk, int qr, dequantize_kernel_t dq>
|
4589 |
| -static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { |
| 4590 | +static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { |
4590 | 4591 | const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
4591 | 4592 | const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
4592 | 4593 | const dim3 block_nums(block_num_x, nrows, 1);
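
For a sense of the launch math (illustrative numbers; assuming CUDA_GET_ROWS_BLOCK_SIZE is 256 as defined earlier in the file): each thread of k_get_rows handles two columns, so for ncols = 4096 the rounding-up division gives block_num_x = (4096 + 511) / 512 = 8, i.e. a grid of 8 x nrows blocks of 256 threads each.
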
@@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat(
|
5810 | 5811 | GGML_ASSERT(nb0 == sizeof(float));
|
5811 | 5812 | GGML_ASSERT(nb00 == sizeof(float));
|
5812 | 5813 |
|
5813 |
| - // TODO: very inefficient, implement in a kernel |
| 5814 | + // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors |
5814 | 5815 | for (int i3 = 0; i3 < nr3; i3++) {
|
5815 | 5816 | for (int k3 = 0; k3 < ne03; k3++) {
|
5816 | 5817 | for (int i2 = 0; i2 < nr2; i2++) {
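
The amended TODO points at batching the per-row copies. A hypothetical sketch of the "fewer cudaMemcpyAsync calls" idea, for the special case where source and destination are contiguous and only the outermost dimension is repeated (all names below are illustrative, not from the patch):

    #include <cstddef>
    #include <cuda_runtime.h>

    static void repeat_outer_contiguous(const float * src_d, float * dst_d,
                                        size_t src_bytes, int nr3, cudaStream_t stream) {
        for (int i3 = 0; i3 < nr3; i3++) {
            // One device-to-device copy of the whole contiguous block per repetition,
            // instead of one cudaMemcpyAsync per row.
            cudaMemcpyAsync((char *) dst_d + (size_t) i3*src_bytes, src_d, src_bytes,
                            cudaMemcpyDeviceToDevice, stream);
        }
    }
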
@@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows(
|
5847 | 5848 | const int ncols = src0->ne[0];
|
5848 | 5849 | const int nrows = ggml_nelements(src1);
|
5849 | 5850 |
|
5850 |
| - const int * src1_i32 = (const int *) src1_d; |
| 5851 | + const int32_t * src1_i32 = (const int32_t *) src1_d; |
5851 | 5852 |
|
5852 | 5853 | switch (src0->type) {
|
5853 | 5854 | case GGML_TYPE_F16: