 #include <atomic>
 #include <chrono>
 #include <cstdarg>
+#include <cstdint>
 #include <fstream>
 #include <functional>
 #include <mutex>
@@ -1381,6 +1382,99 @@ void ModelLoader::process_model_files(bool enable_mmap) { |
     LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
 }
 
+std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                                       std::set<std::string> ignore_tensors)
+{
+    process_model_files(true);
+
+    std::vector<MmapTensorStore> result;
+    uint64_t mapped_bytes = 0;
+    size_t mapped_tensors = 0;
+
+    LOG_DEBUG("memory-mapping tensors...");
+
+    int64_t t_start = ggml_time_ms();
+
+    for (const auto& fdata : file_data) {
+        if (!fdata.mmapped) continue;
+
+        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
+        std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;
+
+        uint8_t* mmap_data = const_cast<uint8_t*>(mmapped->data());
+
+        ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
+        if (!buf_mmap) {
+            LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
+            continue;
+        }
+        ggml_backend_buffer_set_usage(buf_mmap, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        size_t file_mapped_bytes = 0;
+        size_t file_mapped_tensors = 0;
+
+        for (const auto& tensor_storage : file_tensors) {
+            const std::string& name = tensor_storage.name;
+
+            bool is_ignored = false;
+            for (const auto& ignore_prefix : ignore_tensors) {
+                if (starts_with(name, ignore_prefix)) {
+                    is_ignored = true;
+                    break;
+                }
+            }
+            if (is_ignored)
+                continue;
+
+            auto it = tensors.find(name);
+            if (it == tensors.end())
+                continue;
+
+            ggml_tensor* dst_tensor = it->second;
+            if (dst_tensor == nullptr)
+                continue;
+
+            if (tensor_storage.type != dst_tensor->type)
+                continue;
+
+            size_t tensor_size = tensor_storage.nbytes();
+            size_t tensor_offset = tensor_storage.offset;
+
+            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
+                tensor_storage.ne[1] != dst_tensor->ne[1] ||
+                tensor_storage.ne[2] != dst_tensor->ne[2] ||
+                tensor_storage.ne[3] != dst_tensor->ne[3] ||
+                tensor_size != ggml_nbytes(dst_tensor)) {
+                // let load_tensors worry about this
+                continue;
+            }
+
+            dst_tensor->buffer = buf_mmap;
+            dst_tensor->data = mmap_data + tensor_offset;
+
+            file_mapped_bytes += tensor_size;
+            file_mapped_tensors++;
+        }
+
+        if (file_mapped_bytes > 0) {
+            mapped_tensors += file_mapped_tensors;
+            mapped_bytes += file_mapped_bytes;
+            result.push_back({mmapped, buf_mmap});
+        } else {
+            // nothing in this file could be mapped; free the unused
+            // backend buffer instead of leaking it
+            ggml_backend_buffer_free(buf_mmap);
+        }
+    }
+
+    int64_t t_end = ggml_time_ms();
+    int64_t duration_ms = t_end - t_start;
+
+    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
+             mapped_tensors,
+             result.size(),
+             mapped_bytes / (1024.0 * 1024.0),
+             duration_ms / 1000.0);
+
+    return result;
+}
+
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
 
     process_model_files(enable_mmap);
@@ -1473,6 +1567,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread |
             continue;
         }
 
+        // skip tensors that mmap_tensors already bound to a mmapped
+        // weights buffer
+        if (dst_tensor->buffer != nullptr
+            && ggml_backend_buffer_get_usage(dst_tensor->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            continue;
+        }
+
         size_t nbytes_to_read = tensor_storage.nbytes_to_read();
 
         auto read_data = [&](char* buf, size_t n) {
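For context, a minimal sketch of how a caller might combine the two paths. The `loader` and `model_tensors` names below are hypothetical, not part of this patch; only `mmap_tensors`, `load_tensors`, `MmapTensorStore`, and the `GGML_BACKEND_BUFFER_USAGE_WEIGHTS` flag come from the code above.

```cpp
// Hypothetical usage sketch (assumes `loader` is an initialized ModelLoader).
std::map<std::string, ggml_tensor*> model_tensors;  // name -> pre-created tensor
// ... populated from the model's ggml context ...

// Bind matching tensors straight into the memory-mapped weight files.
// The returned stores hold the MmapWrapper and the CPU backend buffer, so
// they must be kept alive for as long as the model uses these weights.
std::set<std::string> ignore;  // name prefixes to leave to load_tensors
std::vector<MmapTensorStore> stores = loader.mmap_tensors(model_tensors, ignore);

// Everything mmap_tensors skipped (ignored prefix, type/shape/size mismatch)
// still goes through the regular read path; load_tensors now leaves mmapped
// tensors alone because their buffer usage is GGML_BACKEND_BUFFER_USAGE_WEIGHTS.
```

Pairing each `MmapWrapper` with its `ggml_backend_buffer_t` in `MmapTensorStore` ties the mapping's lifetime to the buffer the tensors point into, which appears to be why `mmap_tensors` returns the stores to the caller instead of keeping them internal.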