Skip to content

OpenCL: Fixes for older devices. #1435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
May 20, 2023
Merged
220 changes: 159 additions & 61 deletions ggml-opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,71 +10,63 @@
#include "ggml.h"

#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(
static const char * program_source = MULTILINE_QUOTE(

typedef char int8_t;
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;

constant uint QK4_0 = 32;
struct block_q4_0
{
float d;
uint8_t qs[QK4_0 / 2];
uint8_t qs[16]; /* QK4_0 / 2 */
};

constant uint QK4_1 = 32;
struct block_q4_1
{
float d;
float m;
uint8_t qs[QK4_1 / 2];
uint8_t qs[16]; /* QK4_1 / 2 */
};

constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
{
half d;
uint32_t qh;
uint8_t qs[QK5_0 / 2];
uint8_t qs[16]; /* QK5_0 / 2 */
};

constant uint QK5_1 = 32;
struct block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[QK5_1 / 2];
uint8_t qs[16]; /* QK5_1 / 2 */
};

constant uint QK8_0 = 32;
struct block_q8_0
{
float d;
uint8_t qs[QK8_0];
int8_t qs[32]; /* QK8_0 */
};


__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
constant uint qk = QK4_0;

const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK4_0 */
const uint j = get_local_id(0);

const float d = x[i].d;

const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;

y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}

__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
constant uint qk = QK4_1;

const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK4_1 */
const uint j = get_local_id(0);

const float d = x[i].d;
Expand All @@ -83,14 +75,12 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float*
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}

__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
constant uint qk = QK5_0;

const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK5_0 */
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &x[i].d);
Expand All @@ -103,14 +93,12 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float*
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;

y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}

__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
constant uint qk = QK5_1;

const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK5_1 */
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &x[i].d);
Expand All @@ -124,17 +112,16 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float*
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}

__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
constant uint qk = QK8_0;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK8_0 */
const uint j = get_local_id(0);

const float d = x[i].d;
y[i*qk + j] = x[i].qs[j]*d;
y[i*32 + j] = x[i].qs[j]*d;
}

);
Expand All @@ -143,7 +130,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float*
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
Expand Down Expand Up @@ -188,36 +175,147 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co

void ggml_cl_init(void) {
cl_int err = 0;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
printf("\nInitializing CLBlast (First Run)...");
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);

enum { NPLAT = 16, NDEV = 16 };

char text_buffer[1024] = {0};

platform = NULL;
device = NULL;

cl_platform_id platforms[NPLAT];
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(num_platforms, platforms, NULL);
platform = platforms[plat_num];
char platform_buffer[1024];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
device = devices[dev_num];
char device_buffer[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
err = clGetPlatformIDs(NPLAT, platforms, &num_platforms);
CL_CHECK(err, "clGetPlatformIDs");

char * GGML_OPENCL_PLATFORM = getenv("GGML_OPENCL_PLATFORM");
if (GGML_OPENCL_PLATFORM != NULL) {
unsigned plat_num;
if (sscanf(GGML_OPENCL_PLATFORM, " %u", &plat_num) == 1) {
if (plat_num >= num_platforms) {
fprintf(stderr, "ggml_opencl: There is no platform %d\n", plat_num);
exit(1);
} else {
platform = platforms[plat_num];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(text_buffer), &text_buffer, NULL);
}
} else {
for (unsigned i = 0; i < num_platforms; i++) {
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(text_buffer), &text_buffer, NULL);
if (strstr(text_buffer, GGML_OPENCL_PLATFORM) != NULL) {
platform = platforms[i];
break;
}
}
}
if (platform == NULL) {
fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", GGML_OPENCL_PLATFORM);
exit(1);
} else {
fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", text_buffer);
}
}

char * GGML_OPENCL_DEVICE = getenv("GGML_OPENCL_DEVICE");
if (GGML_OPENCL_DEVICE != NULL) {
cl_device_id devices[NDEV];
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, NDEV, devices, &num_devices);
CL_CHECK(err, "clGetDeviceIDs");

unsigned dev_num;
if (sscanf(GGML_OPENCL_DEVICE, " %u", &dev_num) == 1) {
if (dev_num >= num_devices) {
fprintf(stderr, "ggml_opencl: There is no device %d\n", dev_num);
exit(1);
} else {
device = devices[dev_num];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(text_buffer), &text_buffer, NULL);
}
} else {
for (unsigned i = 0; i < num_devices; i++) {
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(text_buffer), &text_buffer, NULL);
if (strstr(text_buffer, GGML_OPENCL_DEVICE) != NULL) {
device = devices[i];
break;
}
}
}
if (device == NULL) {
fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", GGML_OPENCL_DEVICE);
exit(1);
} else {
fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", text_buffer);
if (platform == NULL) {
err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(&platform), &platform, NULL);
CL_CHECK(err, "clGetDeviceInfo");
}
}
}

if (platform == NULL) {
cl_device_id devices[NDEV];
cl_uint num_devices;

for (unsigned i = 0; i < num_platforms; i++) {
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, NDEV, devices, &num_devices);
CL_CHECK(err, "clGetDeviceIDs");

if (num_devices > 0) {
platform = platforms[i];
device = devices[0];
if (num_devices > 1) {
fprintf(stderr, "ggml_opencl: platform has more than 1 GPU, selecting the first.\n");
}
fprintf(stderr, "ggml_opencl: autodetected GPU.\n");
break;
}
}
}

cl_context_properties *properties = platform == NULL ? NULL : (cl_context_properties[]){
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
};

if (device != NULL) {
context = clCreateContext(properties, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
} else {
context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
if (err == CL_DEVICE_NOT_AVAILABLE || err == CL_DEVICE_NOT_FOUND) {
context = clCreateContextFromType(properties, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, &err);
if (err == CL_DEVICE_NOT_AVAILABLE || err == CL_DEVICE_NOT_FOUND) {
context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
}
}
CL_CHECK(err, "clCreateContextFromType");

err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(&device), &device, NULL);
CL_CHECK(err, "clGetContextInfo");

if (platform == NULL) {
err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(&platform), &platform, NULL);
CL_CHECK(err, "clGetDeviceInfo");
}
}


GGML_ASSERT(platform != NULL);
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(text_buffer), &text_buffer, NULL);
fprintf(stderr, "ggml_opencl: using platform: '%s'\n", text_buffer);

GGML_ASSERT(device != NULL);
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(text_buffer), &text_buffer, NULL);
fprintf(stderr, "ggml_opencl: using device: '%s'\n", text_buffer);


queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
if (err == CL_INVALID_PROPERTY || err == CL_INVALID_VALUE) {
queue = clCreateCommandQueue(context, device, 0, &err);
}
CL_CHECK(err, "clCreateCommandQueue");

free(platforms);
free(devices);

program = build_program_from_source(context, device, clblast_dequant);
program = build_program_from_source(context, device, program_source);

// Prepare dequantize kernels
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
Expand Down