Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());

auto graph = std::get<0>(result->second.compiled);
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
// Reuse the GraphTensor output captured at compile time.
// Do not call resume_from_blob_() on workspace-backed logits:
// that registers a second deleter on the same GPU block and
// triggers double free in PinnableBlockAllocator.
auto shared_output = std::get<1>(result->second.compiled);

return std::make_tuple(graph, shared_output);
}
Expand Down
6 changes: 5 additions & 1 deletion csrc/engine/compiler/static_batching_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ StaticBatchingCompiler::Compiled StaticBatchingCompiler::get_compiled(
graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());

auto graph = std::get<0>(result->second.compiled);
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
// Reuse the GraphTensor output captured at compile time.
// Do not call resume_from_blob_() on workspace-backed logits:
// that registers a second deleter on the same GPU block and
// triggers double free in PinnableBlockAllocator.
auto shared_output = std::get<1>(result->second.compiled);
return std::make_tuple(graph, shared_output);
}
} else {
Expand Down
5 changes: 3 additions & 2 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@ InferEngine::InferEngine(
const cache::CacheConfig *cache_config,
bool enable_graph_compiling,
backends::AttentionBackend attention_backend,
std::optional<infinicore::DataType> kv_cache_dtype) // Changed parameter
std::optional<infinicore::DataType> kv_cache_dtype, // Changed parameter
size_t max_num_batched_tokens)
: communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}

// Load model config if model_path is provided, model_path must be valid, and config.json exists
this->model_config_ = infinilm::config::ConfigFactory::createConfig(config_str);
auto infinilm_config = std::make_shared<infinilm::global_state::InfinilmConfig>(attention_backend, this->model_config_);
auto infinilm_config = std::make_shared<infinilm::global_state::InfinilmConfig>(attention_backend, this->model_config_, max_num_batched_tokens);

// Only support offline int8 kv cache quantization in this version
if (kv_cache_dtype.has_value()) {
Expand Down
3 changes: 2 additions & 1 deletion csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ class InferEngine {
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt,
size_t max_num_batched_tokens = 2048);

// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
Expand Down
4 changes: 4 additions & 0 deletions csrc/global_state/forward_context.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "../models/infinilm_model.hpp"
#include <unordered_map>

namespace infinilm::global_state {

Expand Down Expand Up @@ -43,6 +44,9 @@ struct AttentionMetadata {
struct ForwardContext {
AttentionMetadata attn_metadata;
std::vector<infinicore::Tensor> kv_cache_vec;

// preallocated workspace for some modules
std::unordered_map<std::string, infinicore::Tensor> preallocated_workspace;
};

void initialize_forward_context(ForwardContext &forward_context);
Expand Down
15 changes: 13 additions & 2 deletions csrc/global_state/infinilm_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,24 @@ struct InfinilmConfig {
public:
InfinilmConfig() = default;
InfinilmConfig(const infinilm::backends::AttentionBackend &backend,
const std::shared_ptr<infinilm::config::ModelConfig> &model_config)
const std::shared_ptr<infinilm::config::ModelConfig> &model_config,
size_t max_num_batched_tokens)
: attention_backend(backend),
model_config(model_config) {}
model_config(model_config),
max_num_batched_tokens(max_num_batched_tokens) {

if (max_num_batched_tokens > 0) {
const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
enable_preallocated_workspace = true;
}
}

public:
infinilm::backends::AttentionBackend attention_backend;
std::shared_ptr<infinilm::config::ModelConfig> model_config;
size_t max_num_batched_tokens = 0;
bool enable_preallocated_workspace = false;
};

/**
Expand Down
88 changes: 74 additions & 14 deletions csrc/layers/attention/attention.cpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
#include "attention.hpp"
#include "../../global_state/global_state.hpp"
#include "../../utils.hpp"
#include "../rotary_embedding/rotary_embedding.hpp"
#include <string>
#include <tuple>

namespace infinilm::layers::attention {

Attention::Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
size_t layer_idx,
const infinicore::Device &device) {
const infinicore::Device &device)
: device_(device),
dtype_(model_config->get_dtype()) {
layer_idx_ = layer_idx;
hidden_size_ = model_config->get<size_t>("hidden_size");
head_dim_ = model_config->get<size_t>("head_dim");

const auto &dtype{model_config->get_dtype()};
size_t total_num_heads = model_config->get<size_t>("num_attention_heads");
size_t total_num_kv_heads = model_config->get<size_t>("num_key_value_heads");
bool use_bias = model_config->get_or<bool>("attention_bias", true);
Expand All @@ -31,18 +35,24 @@ Attention::Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config
qkv_proj_ = std::make_shared<layers::linear::QKVParallelLinear>(
hidden_size_, head_dim_, total_num_heads, total_num_kv_heads,
"q_proj", "k_proj", "v_proj", register_fn,
quantization_method, use_bias, dtype, device, rank_info);
quantization_method, use_bias, dtype_, device_, rank_info);
o_proj_ = this->register_module<layers::linear::RowParallelLinear>(
"o_proj", total_num_heads * head_dim_, hidden_size_, quantization_method,
use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm);
use_output_bias, dtype_, device_, tp_rank, tp_size, rank_info.comm);

rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device);
rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device_);

float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
attn_ = std::make_shared<AttentionLayer>(num_attention_heads_, head_dim_, scaling, num_key_value_heads_, layer_idx_,
kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_);
kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_, device_);

init_kv_cache_quant_params(register_fn, device, kv_cache_k_scale_, kv_cache_v_scale_);
init_kv_cache_quant_params(register_fn, device_, kv_cache_k_scale_, kv_cache_v_scale_);

rank_qkv_output_size_ = qkv_proj_->out_features() / static_cast<size_t>(tp_size);
enable_preallocated_workspace_ = infinilm::global_state::get_infinilm_config().enable_preallocated_workspace;
if (enable_preallocated_workspace_) {
this->_initialize_preallocated_workspace();
}
}

infinicore::Tensor Attention::forward(const infinicore::Tensor &positions,
Expand All @@ -62,7 +72,15 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
size_t seq_len = shape[1];

// 1. Project Q, K, V
auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
infinicore::Tensor q;
infinicore::Tensor k;
infinicore::Tensor v;
if (enable_preallocated_workspace_) {
auto qkv_output = max_qkv_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, rank_qkv_output_size_});
std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
} else {
std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
}

// 2. Reshape for multi-head attention
auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
Expand All @@ -89,9 +107,13 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
// 5. Attn Backend calculate
auto attn_output = attn_->forward(q_rope, k_reshaped, v_reshaped);

// 7. Project output
auto output = o_proj_->forward(attn_output);
return output;
// 6. Project output
if (enable_preallocated_workspace_) {
auto o_output = max_o_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, hidden_size_});
o_proj_->forward_(o_output, attn_output);
return o_output;
}
return o_proj_->forward(attn_output);
}

infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_ids,
Expand All @@ -106,7 +128,15 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
ASSERT_EQ(batch_size, 1);

// 1. Project Q, K, V
auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
infinicore::Tensor q;
infinicore::Tensor k;
infinicore::Tensor v;
if (enable_preallocated_workspace_) {
auto qkv_output = max_qkv_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, rank_qkv_output_size_});
std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
} else {
std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
}

// 2. Reshape for multi-head attention
auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_});
Expand All @@ -133,8 +163,38 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
auto attn_output = attn_->forward(q_reshaped, k_reshaped, v_reshaped);

// 6. Project output
auto output = o_proj_->forward(attn_output);
return output;
if (enable_preallocated_workspace_) {
auto o_output = max_o_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, hidden_size_});
o_proj_->forward_(o_output, attn_output);
return o_output;
}
return o_proj_->forward(attn_output);
}

void Attention::_initialize_preallocated_workspace() {
const auto &infinilm_config = infinilm::global_state::get_infinilm_config();
auto &preallocated_workspace = infinilm::global_state::get_forward_context().preallocated_workspace;
const size_t max_num_batched_tokens = infinilm_config.max_num_batched_tokens;

const std::string attention_cache_key = std::string("Attention_max_num_batched_tokens_")
+ std::to_string(max_num_batched_tokens) + "_rank_qkv_output_size_"
+ std::to_string(rank_qkv_output_size_) + "_hidden_size_"
+ std::to_string(hidden_size_) + "_dtype_"
+ infinicore::toString(dtype_) + "_device_"
+ device_.toString();

size_t max_output_size = std::max(rank_qkv_output_size_, hidden_size_);
if (preallocated_workspace.find(attention_cache_key) == preallocated_workspace.end()) {
auto attention_buffer = infinicore::Tensor::empty({max_num_batched_tokens * max_output_size}, dtype_, device_);
preallocated_workspace[attention_cache_key] = attention_buffer;
}

auto attention_buffer = preallocated_workspace.at(attention_cache_key);
const auto attention_buffer_shape = attention_buffer->shape();
ASSERT(attention_buffer_shape[0] == max_num_batched_tokens * max_output_size);

max_qkv_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * rank_qkv_output_size_}})->view({max_num_batched_tokens, rank_qkv_output_size_});
max_o_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * hidden_size_}})->view({max_num_batched_tokens, hidden_size_});
}

void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,
Expand Down
21 changes: 18 additions & 3 deletions csrc/layers/attention/attention.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "../../global_state/global_state.hpp"
#include "../linear/linear.hpp"
#include "backends/attention_layer.hpp"
#include "infinicore/device.hpp"
#include "infinicore/dtype.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
Expand Down Expand Up @@ -37,6 +39,8 @@ class Attention : public infinicore::nn::Module {
infinicore::Tensor forward_paged_(const infinicore::Tensor &positions,
const infinicore::Tensor &hidden_states) const;

void _initialize_preallocated_workspace();

protected:
std::shared_ptr<infinilm::layers::linear::QKVParallelLinear> qkv_proj_;
std::shared_ptr<infinilm::layers::linear::RowParallelLinear> o_proj_;
Expand All @@ -49,13 +53,24 @@ class Attention : public infinicore::nn::Module {
size_t num_key_value_heads_;
size_t hidden_size_;
size_t head_dim_;
infinicore::Device device_;
infinicore::DataType dtype_;

// For off-line kv cache quantization
INFINICORE_NN_PARAMETER(kv_cache_k_scale);
INFINICORE_NN_PARAMETER(kv_cache_v_scale);

private:
bool enable_preallocated_workspace_{false};

size_t rank_qkv_output_size_;

// preallocated workspace for Attention
infinicore::Tensor max_qkv_output_;
infinicore::Tensor max_o_output_;
};
void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,
const infinicore::Device &device,
infinicore::nn::Parameter &kv_cache_k_scale,
infinicore::nn::Parameter &kv_cache_v_scale);
const infinicore::Device &device,
infinicore::nn::Parameter &kv_cache_k_scale,
infinicore::nn::Parameter &kv_cache_v_scale);
} // namespace infinilm::layers::attention
9 changes: 5 additions & 4 deletions csrc/layers/attention/backends/attention_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@ AttentionLayer::AttentionLayer(size_t num_heads,
size_t layer_idx,
infinicore::Tensor k_scale,
infinicore::Tensor v_scale,
::infinilm::backends::AttentionBackend attn_backend) : k_scale_(k_scale), v_scale_(v_scale), layer_idx_(layer_idx), attn_backend_(attn_backend) {
::infinilm::backends::AttentionBackend attn_backend,
const infinicore::Device &device) : k_scale_(k_scale), v_scale_(v_scale), layer_idx_(layer_idx), attn_backend_(attn_backend) {
switch (attn_backend) {
case ::infinilm::backends::AttentionBackend::STATIC_ATTN:
attn_backend_impl_ = std::make_shared<backends::StaticAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx);
attn_backend_impl_ = std::make_shared<backends::StaticAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx, device);
break;
case ::infinilm::backends::AttentionBackend::PAGED_ATTN:
attn_backend_impl_ = std::make_shared<backends::PagedAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx);
attn_backend_impl_ = std::make_shared<backends::PagedAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx, device);
break;
case ::infinilm::backends::AttentionBackend::FLASH_ATTN:
attn_backend_impl_ = std::make_shared<backends::FlashAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx);
attn_backend_impl_ = std::make_shared<backends::FlashAttentionImpl>(num_heads, head_size, scale, num_kv_heads, layer_idx, device);
break;
default:
throw std::runtime_error("infinilm::layers::attention::AttentionLayer: unsupported attention backend");
Expand Down
3 changes: 2 additions & 1 deletion csrc/layers/attention/backends/attention_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ class AttentionLayer {
size_t layer_idx,
infinicore::Tensor k_scale,
infinicore::Tensor v_scale,
::infinilm::backends::AttentionBackend attention_backend);
::infinilm::backends::AttentionBackend attention_backend,
const infinicore::Device &device);

infinicore::Tensor forward(infinicore::Tensor &query,
infinicore::Tensor &key,
Expand Down
Loading