@@ -1278,6 +1278,126 @@ struct no_init {
};

struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+ ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32, ReadFile is significantly faster than fread, which is in turn significantly faster than std::fstream.
+        // Thus use the Win32 API for file I/O instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;
@@ -1298,7 +1418,10 @@ struct llama_file {
#else
        long ret = std::ftell(fp);
#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
        return (size_t) ret;
    }

@@ -1308,7 +1431,9 @@ struct llama_file {
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
-        GGML_ASSERT(ret == 0); // same
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
    }

    void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1476,7 @@ struct llama_file {
            std::fclose(fp);
        }
    }
+#endif
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;

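For reference, the Windows and POSIX branches of llama_file expose the same interface, so callers stay platform-agnostic. Below is a minimal sketch of how that interface could be exercised; it is illustrative only and not part of this diff, and the helper name dump_header is hypothetical.

#include <cstdint>
#include <cstdio>

// Hypothetical caller (not from this change): open a model file, read the leading
// 32-bit magic value, and peek at a small prefix of the payload.
static void dump_header(const char * fname) {
    llama_file file(fname, "rb");            // throws std::runtime_error on open failure
    const uint32_t magic = file.read_u32();  // consumes the first 4 bytes (chunked ReadFile/fread underneath)
    uint8_t prefix[64] = {0};
    file.read_raw(prefix, sizeof(prefix));   // throws if the file is shorter than 68 bytes
    printf("%s: %zu bytes total, magic = 0x%08x\n", fname, file.size, magic);
}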
@@ -3721,6 +3847,44 @@ struct llama_model_loader {
        std::vector<no_init<uint8_t>> read_buf;
        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB, seem to be a good default for single NVMe drives.
+        // NVMe RAID configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void *> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the CUDA backend is active, create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            const auto * weight = get_weight(ggml_get_name(cur));
            if (weight == nullptr) {
@@ -3776,19 +3940,55 @@ struct llama_model_loader {
                    }));
                }
            } else {
-                    read_buf.resize(n_size);
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                    // If cuda_backend is valid, load the tensor in chunks into pinned memory and upload the buffers asynchronously to the GPU.
+                    if (cuda_backend) {
+                        file->seek(weight->offs, SEEK_SET);
+
+                        size_t bytes_read = 0;
+
+                        while (bytes_read < n_size) {
+                            size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                            ggml_backend_event_synchronize(events[buffer_idx]);
+                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                            ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                            ggml_backend_event_record(events[buffer_idx]);
+
+                            bytes_read += read_iteration;
+                            ++buffer_idx;
+                            buffer_idx %= num_buffers;
+                        }
+                    }
+                    else
+#endif
+                    {
+                        read_buf.resize(n_size);
+                        file->seek(weight->offs, SEEK_SET);
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                        }
                    }
                }
            }

            size_done += n_size;
        }

+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async CUDA uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers; ++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
        // check validation results
        bool validation_failed = false;
        for (auto & future : validation_result) {