@@ -1318,16 +1318,11 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
13181318 }
13191319}
13201320
1321- bool ModelLoader::load_tensors (on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
1322- int64_t process_time_ms = 0 ;
1323- std::atomic<int64_t > read_time_ms (0 );
1324- std::atomic<int64_t > memcpy_time_ms (0 );
1325- std::atomic<int64_t > copy_to_backend_time_ms (0 );
1326- std::atomic<int64_t > convert_time_ms (0 );
1327- std::atomic<uint64_t > bytes_processed (0 );
1321+ void ModelLoader::process_model_files (bool enable_mmap) {
13281322
1329- int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores ();
1330- LOG_DEBUG (" using %d threads for model loading" , num_threads_to_use);
1323+ if (model_files_processed) {
1324+ return ;
1325+ }
13311326
13321327 int64_t start_time = ggml_time_ms ();
13331328
@@ -1339,22 +1334,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
13391334 processed_tensor_storages.push_back (tensor_storage);
13401335 }
13411336
1342- process_time_ms = ggml_time_ms () - start_time;
1343-
1344- bool success = true ;
1345- size_t total_tensors_processed = 0 ;
1346- const size_t total_tensors_to_process = processed_tensor_storages.size ();
1347- const int64_t t_start = ggml_time_ms ();
1348- int last_n_threads = 1 ;
1349-
13501337 for (size_t file_index = 0 ; file_index < file_paths_.size (); file_index++) {
13511338 std::string file_path = file_paths_[file_index];
1352- LOG_DEBUG (" loading tensors from %s" , file_path.c_str ());
13531339
1354- std::vector<const TensorStorage* > file_tensors;
1340+ std::vector<TensorStorage> file_tensors;
13551341 for (const auto & ts : processed_tensor_storages) {
13561342 if (ts.file_index == file_index) {
1357- file_tensors.push_back (& ts);
1343+ file_tensors.push_back (ts);
13581344 }
13591345 }
13601346 if (file_tensors.empty ()) {
@@ -1363,7 +1349,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
13631349
13641350 bool is_zip = false ;
13651351 for (auto const & ts : file_tensors) {
1366- if (ts-> index_in_zip >= 0 ) {
1352+ if (ts. index_in_zip >= 0 ) {
13671353 is_zip = true ;
13681354 break ;
13691355 }
@@ -1378,6 +1364,58 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
13781364 }
13791365 }
13801366
1367+ ModelFileData fdata;
1368+ fdata.path = file_path;
1369+ fdata.mmapped = std::shared_ptr<MmapWrapper>(std::move (mmapped));
1370+ fdata.tensors = std::move (file_tensors);
1371+ fdata.is_zip = is_zip;
1372+
1373+ file_data.push_back (std::move (fdata));
1374+ }
1375+
1376+ model_files_processed = true ;
1377+
1378+ int64_t end_time = ggml_time_ms ();
1379+ int64_t process_time_ms = end_time - start_time;
1380+
1381+ LOG_INFO (" model files processing completed in %.2fs" , process_time_ms / 1000 .f );
1382+ }
1383+
1384+ bool ModelLoader::load_tensors (on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
1385+
1386+ process_model_files (enable_mmap);
1387+
1388+ std::atomic<int64_t > read_time_ms (0 );
1389+ std::atomic<int64_t > memcpy_time_ms (0 );
1390+ std::atomic<int64_t > copy_to_backend_time_ms (0 );
1391+ std::atomic<int64_t > convert_time_ms (0 );
1392+ std::atomic<uint64_t > bytes_processed (0 );
1393+
1394+ int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores ();
1395+ LOG_DEBUG (" using %d threads for model loading" , num_threads_to_use);
1396+
1397+ int64_t start_time = ggml_time_ms ();
1398+
1399+ size_t total_tensors_to_process = 0 ;
1400+ for (const auto & fdata : file_data) {
1401+ total_tensors_to_process += fdata.tensors .size ();
1402+ }
1403+
1404+ bool success = true ;
1405+ size_t total_tensors_processed = 0 ;
1406+ const int64_t t_start = start_time;
1407+ int last_n_threads = 1 ;
1408+
1409+ for (auto & fdata : file_data) {
1410+ const std::string & file_path = fdata.path ;
1411+ LOG_DEBUG (" loading tensors from %s" , file_path.c_str ());
1412+
1413+ const std::vector<TensorStorage> & file_tensors = fdata.tensors ;
1414+
1415+ bool is_zip = fdata.is_zip ;
1416+
1417+ std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped ;
1418+
13811419 int n_threads = is_zip ? 1 : std::min (num_threads_to_use, (int )file_tensors.size ());
13821420 if (n_threads < 1 ) {
13831421 n_threads = 1 ;
@@ -1418,7 +1456,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
14181456 break ;
14191457 }
14201458
1421- const TensorStorage& tensor_storage = * file_tensors[idx];
1459+ const TensorStorage& tensor_storage = file_tensors[idx];
14221460 ggml_tensor* dst_tensor = nullptr ;
14231461
14241462 t0 = ggml_time_ms ();
@@ -1578,9 +1616,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
15781616 }
15791617
15801618 int64_t end_time = ggml_time_ms ();
1581- LOG_INFO (" loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)" ,
1619+ LOG_INFO (" loading tensors completed, taking %.2fs (read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)" ,
15821620 (end_time - start_time) / 1000 .f ,
1583- process_time_ms / 1000 .f ,
15841621 (read_time_ms.load () / (float )last_n_threads) / 1000 .f ,
15851622 (memcpy_time_ms.load () / (float )last_n_threads) / 1000 .f ,
15861623 (convert_time_ms.load () / (float )last_n_threads) / 1000 .f ,
0 commit comments