# Review notes (commentary placed ahead of the first "diff --git" header):
# - csrc/engine/rank_worker.cpp: adds ASSERT checks on input_offsets before the per-request score row is narrowed and passed to random_sample_.
# - python/infinilm/processors/basic_llm_processor.py: adds extend_to_alignment(), pads seq_offsets and cu_seqlens before infinicore.from_list and narrows the resulting tensors back to the logical length; the remaining hunks only re-wrap long lines.
# NOTE(review): `static_cast(...)` and `std::uniform_real_distribution(0, 1)` carry no template arguments in this copy; presumably they were lost when the patch text was extracted. Confirm against the original commit.
# NOTE(review): line breaks and indentation of the hunks look collapsed in this copy, so it probably does not apply as-is. TODO confirm against the original patch.
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index e64f2568..cc54188a 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -370,18 +370,22 @@ void RankWorker::thread_loop() { const auto &vocab_size{logits_shape[2]}; const auto &total_len{logits_shape[1]}; const auto &batch_size{logits_shape[0]}; + int32_t seq_length = static_cast(batch_size * total_len); auto n_req = local_args.input_offsets.value()->size(0) - 1; int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data(); + ASSERT(input_offsets[n_req] == seq_length); auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)}; for (auto i{decltype(n_req)(0)}; i < n_req; ++i) { - auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})}; + int32_t score_index = input_offsets[i + 1] - 1; + ASSERT((input_offsets[i + 1] > input_offsets[i]) && (score_index >= 0) && (score_index < seq_length)); + + auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(score_index), 1}})->view({vocab_size})}; auto out{output_ids->narrow({{0, i, 1}})->view({})}; float random_val = std::uniform_real_distribution(0, 1)(rng_); - infinicore::op::random_sample_( - out, score, random_val, top_p, top_k, temperature); + infinicore::op::random_sample_(out, score, random_val, top_p, top_k, temperature); } output_ids = output_ids->to(infinicore::Device::cpu()); diff --git a/python/infinilm/processors/basic_llm_processor.py b/python/infinilm/processors/basic_llm_processor.py index 6fafb55a..650452ab 100644 --- a/python/infinilm/processors/basic_llm_processor.py +++ b/python/infinilm/processors/basic_llm_processor.py @@ -4,6 +4,29 @@ from ..llm.scheduler import SchedulerOutput +def extend_to_alignment(lst, alignment: int = 64): + """Pad ``lst`` to a multiple of ``alignment`` elements with ``-1``. 
+ + ``alignment`` is in elements, not bytes (default 64). Required for safe + ``infinicore.from_list`` copies; callers should ``narrow`` to the logical + length before passing data to kernels. + + Args: + lst: Input list of numeric offsets or cumulative lengths. + alignment: Element-count alignment. Defaults to 64. + + Returns: + A new list. Empty input yields ``[0]``; already aligned yields a copy. + """ + if not lst: + return [0] + n = len(lst) + aligned_len = ((n + alignment - 1) // alignment) * alignment + if aligned_len == n: + return lst[:] + return lst + [-1] * (aligned_len - n) + + @register_processor("default") class BasicLLMProcessor(InfinilmProcessor): def __init__(self, model_dir_path: str): @@ -25,7 +48,9 @@ def __call__(self, prompt: str, return_tensors: str = None, **kwargs) -> dict: import infinicore result = {} - for key, tensor in self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False).items(): + for key, tensor in self.tokenizer( + prompt, return_tensors="pt", add_special_tokens=False + ).items(): result[key] = tensor.from_torch(tensor) return result @@ -42,9 +67,13 @@ def apply_chat_template( normalized_conversation = [] for message in conversation: if isinstance(message["content"], list): - assert len(message["content"]) == 1, "Only one content item supported in list" + assert len(message["content"]) == 1, ( + "Only one content item supported in list" + ) content_item = message["content"][0] - assert "type" in content_item and "text" in content_item, "Content dict must have 'type' and 'text' keys" + assert "type" in content_item and "text" in content_item, ( + "Content dict must have 'type' and 'text' keys" + ) normalized_conversation.append( {"role": message["role"], "content": content_item["text"]} ) @@ -228,17 +257,42 @@ def _build_model_input_from_batch_scheduler_output( block_tables.append(padded_block_table) cu_seqlens.append(cu_seqlens[-1] + seq_len) + assert seq_offsets[-1] == len(tokens), ( + 
f"seq_offsets[-1]={seq_offsets[-1]} != len(tokens)={len(tokens)}" + ) + + length = len(seq_offsets) + # Pad to a 64-element boundary for safe from_list/H2D copy, then narrow + # back to the logical length. + seq_offsets = extend_to_alignment(seq_offsets) + cu_seqlens = extend_to_alignment(cu_seqlens) + + # TODO: the other inputs (position_ids, past_kv_lengths, total_kv_lengths, slot_mapping) should all be one-dimensional as well; please pad them too and then narrow. + input_ids = infinicore.from_list([tokens], dtype=infinicore.int64) + position_ids = infinicore.from_list(position_ids, dtype=infinicore.int64) + past_kv_lengths = infinicore.from_list(cached_lens, dtype=infinicore.int32) + total_kv_lengths = infinicore.from_list(seq_lens, dtype=infinicore.int32) + + input_offsets = infinicore.from_list( + seq_offsets, dtype=infinicore.int32 + ).narrow(0, 0, length) + + cu_seqlens = infinicore.from_list(cu_seqlens, dtype=infinicore.int32).narrow( + 0, 0, length + ) + + block_tables = infinicore.from_list(block_tables, dtype=infinicore.int32) + slot_mapping = infinicore.from_list(slot_mapping, dtype=infinicore.int64) + return { - "input_ids": infinicore.from_list([tokens], dtype=infinicore.int64), - "position_ids": infinicore.from_list(position_ids, dtype=infinicore.int64), - "past_kv_lengths": infinicore.from_list( - cached_lens, dtype=infinicore.int32 - ), - "total_kv_lengths": infinicore.from_list(seq_lens, dtype=infinicore.int32), - "input_offsets": infinicore.from_list(seq_offsets, dtype=infinicore.int32), - "cu_seqlens": infinicore.from_list(cu_seqlens, dtype=infinicore.int32), - "block_tables": infinicore.from_list(block_tables, dtype=infinicore.int32), - "slot_mapping": infinicore.from_list(slot_mapping, dtype=infinicore.int64), + "input_ids": input_ids, + "position_ids": position_ids, + "past_kv_lengths": past_kv_lengths, + "total_kv_lengths": total_kv_lengths, + "input_offsets": input_offsets, + "cu_seqlens": cu_seqlens, + "block_tables": block_tables, + "slot_mapping": slot_mapping, "temperature": temperature, "top_k": top_k, 
"top_p": top_p,