 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
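
The new define extends the CUDA-to-HIP shim so that asynchronous memset calls also compile on ROCm builds. A minimal sketch of how such a shim is used (the helper below is illustrative, not part of the patch):

    #include <cstddef>
    #include <cuda_runtime.h>  // on ROCm builds, the hip headers plus the defines above take its place

    static cudaError_t zero_async(void * dev_ptr, size_t nbytes, cudaStream_t stream) {
        // With "#define cudaMemsetAsync hipMemsetAsync" in effect, the preprocessor
        // rewrites this call to the HIP equivalent; the source stays CUDA-flavored.
        return cudaMemsetAsync(dev_ptr, 0, nbytes, stream);
    }
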
@@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
     const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
 
@@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
|
4586 | 4587 |
|
4587 | 4588 |
|
4588 | 4589 | template<int qk, int qr, dequantize_kernel_t dq>
|
4589 |
| -static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { |
| 4590 | +static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { |
4590 | 4591 | const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
4591 | 4592 | const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
4592 | 4593 | const dim3 block_nums(block_num_x, nrows, 1);
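
For a sense of the launch math (illustrative numbers; assuming CUDA_GET_ROWS_BLOCK_SIZE is 256 as defined earlier in the file): each thread of k_get_rows handles two columns, so for ncols = 4096 the rounding-up division gives block_num_x = (4096 + 511) / 512 = 8, i.e. a grid of 8 x nrows blocks of 256 threads each.
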
@@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat(
|
5810 | 5811 | GGML_ASSERT(nb0 == sizeof(float));
|
5811 | 5812 | GGML_ASSERT(nb00 == sizeof(float));
|
5812 | 5813 |
|
5813 |
| - // TODO: very inefficient, implement in a kernel |
| 5814 | + // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors |
5814 | 5815 | for (int i3 = 0; i3 < nr3; i3++) {
|
5815 | 5816 | for (int k3 = 0; k3 < ne03; k3++) {
|
5816 | 5817 | for (int i2 = 0; i2 < nr2; i2++) {
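
The amended TODO points at batching the per-row copies. A hypothetical sketch of the "fewer cudaMemcpyAsync calls" idea, for the special case where source and destination are contiguous and only the outermost dimension is repeated (all names below are illustrative, not from the patch):

    #include <cstddef>
    #include <cuda_runtime.h>

    static void repeat_outer_contiguous(const float * src_d, float * dst_d,
                                        size_t src_bytes, int nr3, cudaStream_t stream) {
        for (int i3 = 0; i3 < nr3; i3++) {
            // One device-to-device copy of the whole contiguous block per repetition,
            // instead of one cudaMemcpyAsync per row.
            cudaMemcpyAsync((char *) dst_d + (size_t) i3*src_bytes, src_d, src_bytes,
                            cudaMemcpyDeviceToDevice, stream);
        }
    }
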
@@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows(
|
5847 | 5848 | const int ncols = src0->ne[0];
|
5848 | 5849 | const int nrows = ggml_nelements(src1);
|
5849 | 5850 |
|
5850 |
| - const int * src1_i32 = (const int *) src1_d; |
| 5851 | + const int32_t * src1_i32 = (const int32_t *) src1_d; |
5851 | 5852 |
|
5852 | 5853 | switch (src0->type) {
|
5853 | 5854 | case GGML_TYPE_F16: