
Commit 6a2f0b3

mtavenrath and slaren authored
Implement non-mapped async IO for CUDA on Windows. (#7896)
* Implement non-mapped async IO for CUDA on Windows. On a fast Gen5 NVMe drive this change improves model load time by >3x, while it should be the same (or slightly faster) on any other drive.
* Free resources except for the backend.
* Change assertions to exceptions in llama_file, find the correct CUDA backend to create CUDA resources, and respect the use_mmap flag again for CUDA.
* Apply suggestions from code review

  Co-authored-by: slaren <[email protected]>
* Fix editorconfig and unused variable
* Fix issues with Windows build

---------

Co-authored-by: slaren <[email protected]>
1 parent 21be9ca commit 6a2f0b3
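
For reference, a minimal sketch of the staging-buffer rotation the commit message describes, using the ggml-backend calls that appear in the diff below. It is not the commit's exact code: the function name upload_tensor_async and its parameter list are illustrative, and it assumes a valid CUDA backend plus pinned host buffers and events set up as in the diff (llama_file is the struct defined in llama.cpp).

// Sketch only: round-robin pinned staging buffers for async GPU uploads.
// `backend`, `host_ptrs`, `events`, `buffer_size` and `num_buffers` are assumed to be
// initialized as in the diff below (pinned CPU buffers, one ggml_backend_event_t per buffer).
#include <algorithm>
#include <cstddef>
#include <vector>
#include "ggml-backend.h"

static void upload_tensor_async(ggml_backend_t backend, ggml_tensor * tensor,
                                llama_file & file, size_t offs, size_t n_size,
                                const std::vector<void *> & host_ptrs,
                                const std::vector<ggml_backend_event_t> & events,
                                size_t buffer_size, size_t num_buffers) {
    file.seek(offs, SEEK_SET);
    size_t bytes_read = 0;
    size_t buffer_idx = 0;
    while (bytes_read < n_size) {
        const size_t n = std::min(buffer_size, n_size - bytes_read);
        // wait until the previous upload from this staging buffer has finished
        ggml_backend_event_synchronize(events[buffer_idx]);
        // read the next chunk from disk into pinned host memory
        file.read_raw(host_ptrs[buffer_idx], n);
        // queue an async copy of the chunk into the tensor's device buffer
        ggml_backend_tensor_set_async(backend, tensor, host_ptrs[buffer_idx], bytes_read, n);
        // record when this staging buffer becomes reusable
        ggml_backend_event_record(events[buffer_idx]);
        bytes_read += n;
        buffer_idx = (buffer_idx + 1) % num_buffers;
    }
}

The point of the rotation is overlap: while one 1 MB chunk is in flight to the GPU, the next chunk is already being read from the NVMe drive into a different pinned buffer, which is the overlap the >3x load-time figure in the commit message relies on.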

File tree

1 file changed: +208 −8 lines


llama.cpp (+208 −8)
@@ -1278,6 +1278,126 @@ struct no_init {
 };
 
 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %lu", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significantly faster than fread, which is again significantly faster than std::fstream.
+        // Thus use the Win32 API to do file IO instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks > 64 MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks > 64 MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1298,7 +1418,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }
 
@@ -1308,7 +1431,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-        GGML_ASSERT(ret == 0); // same
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }
 
     void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1476,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
@@ -3721,6 +3847,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads; 1 MB each seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmapped IO, use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the CUDA backend is active, create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3776,19 +3940,55 @@ struct llama_model_loader {
                         }));
                     }
                 } else {
-                    read_buf.resize(n_size);
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                    // If cuda_backend is valid, load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                    if (cuda_backend) {
+                        file->seek(weight->offs, SEEK_SET);
+
+                        size_t bytes_read = 0;
+
+                        while (bytes_read < n_size) {
+                            size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                            ggml_backend_event_synchronize(events[buffer_idx]);
+                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                            ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                            ggml_backend_event_record(events[buffer_idx]);
+
+                            bytes_read += read_iteration;
+                            ++buffer_idx;
+                            buffer_idx %= num_buffers;
+                        }
+                    }
+                    else
+#endif
+                    {
+                        read_buf.resize(n_size);
+                        file->seek(weight->offs, SEEK_SET);
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                        }
                     }
                 }
             }
 
             size_done += n_size;
         }
 
+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async CUDA uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers; ++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {

0 commit comments
