diff --git a/common/debug.cpp b/common/debug.cpp index 0df409a79db..4732f73c443 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -3,6 +3,10 @@ #include "log.h" #include +#include +#include +#include +#include #include static std::string common_ggml_ne_string(const ggml_tensor * t) { @@ -155,6 +159,53 @@ template bool common_debug_cb_eval(struct ggml_tensor * t, b if (!ggml_is_quantized(t->type) && matches_filter) { uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); common_debug_print_tensor(data, t->type, t->ne, t->nb, 3); + + // Optional full-tensor binary dump for layer-by-layer comparison work. + // Activated by setting env var LLAMA_DUMP_TENSORS_FILE=/path/to/out.bin. + // Optionally narrow what gets dumped with LLAMA_DUMP_TENSORS_REGEX + // (a single regex; unanchored, matched with std::regex_search). If unset or + // invalid, every tensor that already matched cb_data's filter gets dumped. + // Per-tensor binary record (written in host byte order; little-endian on most hosts): + // u32 name_len, char name[name_len], + // u32 dtype (ggml_type), i64 ne[4], + // u64 n_bytes, u8 data[n_bytes] + const char * dump_path = std::getenv("LLAMA_DUMP_TENSORS_FILE"); + if (dump_path) { + static std::regex dump_regex; + static bool dump_regex_set = false; + static bool dump_regex_valid = false; + if (!dump_regex_set) { + dump_regex_set = true; + const char * pat = std::getenv("LLAMA_DUMP_TENSORS_REGEX"); + if (pat && *pat) { + try { dump_regex = std::regex(pat); dump_regex_valid = true; } + catch (const std::regex_error &) { dump_regex_valid = false; } + } + } + bool should_dump = !dump_regex_valid || std::regex_search(t->name, dump_regex); + if (should_dump) { + static FILE * dump_fout = nullptr; + static std::string opened_path; + if (!dump_fout || opened_path != dump_path) { + if (dump_fout) fclose(dump_fout); + dump_fout = std::fopen(dump_path, "wb"); + opened_path = dump_path; + } + if (dump_fout) { + uint32_t name_len = (uint32_t) std::strlen(t->name); + std::fwrite(&name_len, 4, 1, dump_fout); + 
std::fwrite(t->name, 1, name_len, dump_fout); + uint32_t dtype = (uint32_t) t->type; + std::fwrite(&dtype, 4, 1, dump_fout); + int64_t ne[4] = { t->ne[0], t->ne[1], t->ne[2], t->ne[3] }; + std::fwrite(ne, 8, 4, dump_fout); + uint64_t nbytes = (uint64_t) ggml_nbytes(t); + std::fwrite(&nbytes, 8, 1, dump_fout); + std::fwrite(data, 1, nbytes, dump_fout); + std::fflush(dump_fout); + } + } + } } return true; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 46469c86200..dbb11961d62 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1490,6 +1490,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 res = "kanana2" + if chkhsh == "5f9861fd826d8e124b222f41f41b928e78d8f6c8fbdf25625d06cc1e8736662c": + # ref: https://huggingface.co/OpenLLM-France/Luciole-1B-Base + res = "qwen2" if res is None: logger.warning("\n") @@ -1515,15 +1518,179 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") - def _set_vocab_gpt2(self) -> None: + @staticmethod + def _gpt2_bytes_to_unicode() -> dict[int, str]: + # Returns the GPT-2 byte-to-unicode mapping: each byte (0-255) maps to a + # printable unicode character. Printable ASCII and Latin-1 supplement bytes + # map to themselves; remaining bytes are shifted to 256+. + # This is the same as openai/gpt-2's bytes_to_unicode(). 
+ bs = list(range(ord("!"), ord("~") + 1)) + list(range(0xA1, 0xAC + 1)) + list(range(0xAE, 0xFF + 1)) + cs = list(bs) + n = 0 + for b in range(256): + if b not in bs: + bs.append(b) + cs.append(256 + n) + n += 1 + return dict(zip(bs, (chr(c) for c in cs))) + + def _set_vocab_gpt2(self, convert_metaspace_to_gpt2=False) -> None: tokens, toktypes, tokpre = self.get_vocab_base() + + if convert_metaspace_to_gpt2: + # The tokenizer uses raw UTF-8 with Metaspace (▁ for spaces), but + # the "gpt2" tokenizer model in llama.cpp expects GPT-2 byte encoding + # (where each byte is mapped to a printable unicode char, e.g. space -> Ġ). + # Convert all tokens: replace ▁ back to space, then apply GPT-2 byte encoding. + byte_encoder = self._gpt2_bytes_to_unicode() + seen: set[str] = set() + for i, token in enumerate(tokens): + if toktypes[i] in (gguf.TokenType.NORMAL, gguf.TokenType.USER_DEFINED): + if token == " ": + # Useless token in Luciole + encoded = "".join(byte_encoder[b] for b in "\u2581".encode("utf-8")) + else: + encoded = "".join(byte_encoder[b] for b in token.replace("\u2581", " ").encode("utf-8")) + assert encoded not in seen, f"Unexpected collision in GPT-2 byte encoding: {encoded!r} for '{token}'" + seen.add(encoded) + tokens[i] = encoded + else: # gguf.TokenType.CONTROL + print("NOCOMMIT", i, token, toktypes[i]) + assert token not in seen, f"Unexpected collision in GPT-2 byte encoding: {token}" + seen.add(token) + self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + if convert_metaspace_to_gpt2: + special_vocab.merges = [ + " ".join( + "".join(byte_encoder[b] for b in part.replace("\u2581", " ").encode("utf-8")) + for part in merge.split(" ") + ) + for merge in special_vocab.merges + ] special_vocab.add_to_gguf(self.gguf_writer) + return tokens + + def 
_set_vocab_bpe_as_spm(self) -> None: + """Convert a HuggingFace BPE tokenizer (with Metaspace ▁) to SPM format for llama.cpp. + + This reads the vocab from tokenizer.json, keeps tokens in their original + UTF-8 form (with ▁ preserved), assigns scores from merge ranks, and adds + byte fallback tokens <0x00>-<0xFF> required by the SPM tokenizer in C++. + """ + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) + + reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + + # Build merge rank lookup: token_text -> rank (lower rank = merged earlier = higher priority) + merge_ranks: dict[str, int] = {} + merges_file = self.dir_model / "tokenizer.json" + if merges_file.is_file(): + import json as _json + with open(merges_file, "r", encoding="utf-8") as f: + tokenizer_json = _json.load(f) + merges = tokenizer_json.get("model", {}).get("merges", []) + for rank, merge in enumerate(merges): + # merge can be "token_a token_b" (str) or ["token_a", "token_b"] (list) + parts = merge.split(" ") if isinstance(merge, str) else merge + merged_token = "".join(parts) + if merged_token not in merge_ranks: + merge_ranks[merged_token] = rank + + # Prepare token arrays + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + # Track which byte values are covered (for byte fallback) + byte_token_ids: dict[int, int] = {} + + for token_id in range(vocab_size): + if token_id not in reverse_vocab: + continue + + token_text = reverse_vocab[token_id] + + if token_id in added_tokens_decoder: + info = added_tokens_decoder[token_id] + if info.special or self.does_token_look_special(token_text): + tokens[token_id] = 
token_text.encode("utf-8") + scores[token_id] = 0.0 + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + continue + + # Check if this is a byte fallback token (<0xHH>) or a single-byte token + import re as _re + raw_bytes = token_text.encode("utf-8") + byte_match = _re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token_text) + if byte_match: + byte_val = int(byte_match.group(1), 16) + byte_token_ids[byte_val] = token_id + tokens[token_id] = token_text.encode("utf-8") + scores[token_id] = -10000.0 + toktypes[token_id] = SentencePieceTokenTypes.BYTE + continue + elif len(raw_bytes) == 1: + byte_token_ids[raw_bytes[0]] = token_id + + # Assign score based on merge rank or token_id + if token_text in merge_ranks: + # Merged tokens: earlier merges get higher (less negative) scores + # Use negative rank so that rank 0 (first merge) gets highest score + score = -float(merge_ranks[token_text]) + else: + # Base tokens (single chars) get high scores; unknown tokens get low scores + if len(raw_bytes) == 1: + score = 0.0 + else: + score = -10000.0 + float(token_id) + + tokens[token_id] = raw_bytes + scores[token_id] = score + toktypes[token_id] = SentencePieceTokenTypes.NORMAL + + # Add byte fallback tokens for any missing byte values + # SPM in llama.cpp requires <0x00> through <0xFF> with BYTE type + next_pad_idx = 0 + for byte_val in range(256): + if byte_val in byte_token_ids: + continue # already handled above + hex_str = f"<0x{byte_val:02X}>" + if byte_val in byte_token_ids: + tid = byte_token_ids[byte_val] + tokens[tid] = hex_str.encode("utf-8") + toktypes[tid] = SentencePieceTokenTypes.BYTE + scores[tid] = -10000.0 + else: + # Find an unused PAD slot + while next_pad_idx < len(tokens) and toktypes[next_pad_idx] != SentencePieceTokenTypes.UNUSED: + next_pad_idx += 1 + if next_pad_idx < vocab_size: + tokens[next_pad_idx] = hex_str.encode("utf-8") + toktypes[next_pad_idx] = SentencePieceTokenTypes.BYTE + scores[next_pad_idx] = -10000.0 + next_pad_idx += 1 + else: + 
logger.warning(f"No room to add byte fallback token {hex_str}") + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + return tokens def _set_vocab_qwen(self): dir_model = self.dir_model @@ -9607,14 +9774,50 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +LUCIOLE_TO_BPE = False +def set_vocab_luciole(self): + # Luciole + # Promote every entry of added_tokens_decoder to a control token, even those + # flagged "special": false in tokenizer_config.json (e.g. , + # , , ). Otherwise llama.cpp's + # tokenizer BPE-splits them at inference, diverging from training. + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + added_token_texts = {info.content for info in tokenizer.added_tokens_decoder.values()} + original_does_token_look_special = self.does_token_look_special + + def does_token_look_special_with_added(token): + token_text = token.decode("utf-8") if isinstance(token, (bytes, bytearray)) else token + if token_text in added_token_texts: + return True + return original_does_token_look_special(token) + + self.does_token_look_special = does_token_look_special_with_added + try: + if LUCIOLE_TO_BPE: + tokens = self._set_vocab_gpt2(convert_metaspace_to_gpt2=True) + self.gguf_writer.add_pad_token_id(tokens.index("")) + self.gguf_writer.add_unk_token_id(tokens.index("")) + else: + tokens = self._set_vocab_bpe_as_spm() + self.gguf_writer.add_pad_token_id(tokens.index(b"")) + self.gguf_writer.add_unk_token_id(tokens.index(b"")) + finally: + self.does_token_look_special = original_does_token_look_special + 
self.gguf_writer.add_add_space_prefix(True) + + @ModelBase.register("NemotronForCausalLM") class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): - self._set_vocab_sentencepiece() - self.gguf_writer.add_pad_token_id(0) - self.gguf_writer.add_unk_token_id(1) + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + self.gguf_writer.add_pad_token_id(0) + self.gguf_writer.add_unk_token_id(1) + else: + set_vocab_luciole(self) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -9642,8 +9845,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # model.layers.{l}.input_layernorm.weight # model.layers.{l}.post_attention_layernorm.weight # model.norm.weight + # NOTE: cast to fp32 BEFORE the +1 — source weights are bf16/fp16 and the + # add would otherwise happen at the source dtype, quantizing γ by ~3.9e-3 + # (bf16) / ~9.8e-4 (fp16) per element. GGUF stores these tensors as F32, + # so doing the arithmetic at full precision is free. 
if name.endswith("norm.weight"): - data_torch = data_torch + 1 + data_torch = data_torch.float() + 1 + + # for tied embeddings, duplicate token_embd as output.weight + if self.hparams.get("tie_word_embeddings", False) and name == "model.embed_tokens.weight": + yield (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) + + # skip lm_head.weight if tie_word_embeddings is True (already emitted from embed_tokens above) + if self.hparams.get("tie_word_embeddings", False) and name == "lm_head.weight": + return yield from super().modify_tensors(data_torch, name, bid) @@ -10091,6 +10306,8 @@ def __init__(self, *args, **kwargs): self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE self.is_moe = True + self.is_luciole = hparams.get("bos_token_id", -1) == 0 + super().__init__(*args, **kwargs) # Save the top-level head_dim for later @@ -10164,6 +10381,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_moe_latent_size(latent_size) def set_vocab(self): + if self.is_luciole: + set_vocab_luciole(self) + return + super().set_vocab() # The tokenizer _does_ add a BOS token (via post_processor type diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 17d162d95d3..3f5e382b120 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -15,7 +15,14 @@ static bool run(llama_context * ctx, const common_params & params) { const bool add_bos = llama_vocab_get_add_bos(vocab); - std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); + // Opt-in atomic tokenization of control strings: set + // LLAMA_TOKENIZE_PARSE_SPECIAL=1 to make chat-template tokens like + // <|im_start|> / <|im_end|> / tokenize as a single id instead + // of being byte-split. Default behaviour (env var unset) is unchanged. 
+ const char * env_parse_special = std::getenv("LLAMA_TOKENIZE_PARSE_SPECIAL"); + const bool parse_special = env_parse_special != nullptr && + env_parse_special[0] != '\0' && env_parse_special[0] != '0'; + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos, parse_special); if (tokens.empty()) { LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__); diff --git a/test_conversion/README.md b/test_conversion/README.md new file mode 100644 index 00000000000..98ebf416ed6 --- /dev/null +++ b/test_conversion/README.md @@ -0,0 +1,283 @@ +# test_conversion + +Validate that an HF transformers model has been converted to GGUF +faithfully — both the tokenizer/chat-template AND the model weights — by +comparing the **transformers reference** against the **GGUF served by +Ollama**. Used to catch tokenizer drift, broken chat templates, +quantization regressions, conversion bugs in `convert_hf_to_gguf.py`, +etc., before publishing a release. + +The suite has two parts: + +1. **`test_main.py`** — main 5-step pipeline (tokenizer, chat + template, behaviour, logits). Run this for every release. +2. **`run_layer_diff.py` + `compare_layers.py`** — deeper layer-by-layer + activation comparison. Run when (1) flags a logit-level regression + and you need to localize which op causes it. + +## Prerequisites + +### Python + +```bash +pip install transformers torch requests numpy matplotlib safetensors +``` + +### llama.cpp built (with our patches) + +The layer-diff tool uses two env-gated patches in `llama.cpp`: + +- `common/debug.cpp` — binary tensor dump triggered by + `LLAMA_DUMP_TENSORS_FILE` and `LLAMA_DUMP_TENSORS_REGEX`. Pure no-op + when those vars are unset. +- `examples/eval-callback/eval-callback.cpp` — atomic tokenization of + control tokens (`<|im_start|>` etc.) when `LLAMA_TOKENIZE_PARSE_SPECIAL=1`. + Default behaviour unchanged. 
+ +Build: + +```bash +cd /path/to/llama.cpp +cmake -B build +cmake --build build --target llama-eval-callback -j$(nproc) +``` + +### Ollama server running + +```bash +ollama serve # in another terminal +``` + +The main pipeline talks to it on `http://localhost:11434`. + +--- + +## Step 0 — Convert HF → GGUF + +Starting from a HuggingFace transformers checkpoint directory: + +```bash +HF_MODEL=/path/to/Luciole-1B-SFT-1.2 # contains config.json, model.safetensors, tokenizer.* +GGUF_DIR=/path/to/Luciole-1B-SFT-1.2-gguf # output directory + +mkdir -p "$GGUF_DIR" +python /path/to/llama.cpp/convert_hf_to_gguf.py \ + "$HF_MODEL" \ + --outfile "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \ + --outtype f16 +``` + +For a quantized variant (smaller, faster, with some precision loss): + +```bash +/path/to/llama.cpp/build/bin/llama-quantize \ + "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \ + "$GGUF_DIR/Luciole-1B-SFT-q4_k_m.gguf" \ + Q4_K_M +``` + +## Step 1 — Write the Ollama `Modelfile` + +In `$GGUF_DIR/Modelfile`: + +``` +FROM ./Luciole-1B-SFT-f16.gguf # or your quantized variant +PARAMETER seed 1234 +PARAMETER num_ctx 32000 +PARAMETER temperature 0.6 +SYSTEM "You are a helpful AI assistant named Luciole, trained by LINAGORA and OpenLLM France." +TEMPLATE """ +…your Go-template version of the jinja chat template, including {{- range .Tools }}{{ . }}{{- end }} for tool support… +""" +PARAMETER stop "<|im_end|>" +PARAMETER stop "<|im_start|>" +… +``` + +Two pitfalls Ollama 0.24 hits silently: + +- `FROM` must be **relative** to the Modelfile directory. Absolute paths + fail with `no Modelfile or safetensors files found`. +- Ollama detects tool-calling capability from the template body. For the + `nemotron` architecture only the literal `{{ . }}` form inside + `{{ range .Tools }}` is recognized — `{{ .Function }}` or + `{{ json . }}` will silently disable tool support (Ollama returns + `does not support tools` on any tool request). 
+ +--- + +## Step 2 — Run the main pipeline + +```bash +cd /path/to/llama.cpp/test_conversion +python test_main.py "$HF_MODEL" "$GGUF_DIR" +``` + +This runs five steps; each writes a JSON into +`results/__vs__/` and is skipped on rerun if +its JSON already exists. Pass `--force` to recompute, or delete the +JSON manually for a partial rerun. Slow steps can be turned off with +`--no-behavior` and `--no-logits`. + +### What each step checks + +| step | script | output | what it tests | +|------|--------|--------|---------------| +| 1 | `run_transformers.py` | `transformers.json` | renders each test case with `tokenizer.apply_chat_template(...)` and tokenizes — the reference for everything else. | +| 2 | `run_ollama.py` | `ollama.json` | per case, asks Ollama for `prompt_eval_count` two ways: (a) `/api/chat` (Ollama applies its Modelfile template + GGUF tokenizer); (b) `/api/generate raw=true` fed the transformers-rendered prompt (GGUF tokenizer only). | +| 3 | `run_behavior.py` | `behavior.json` | for cases with `expected_behavior`, sends the prompt to the model via `/api/generate raw=true`, parses the generated text for `{…}`, verifies tool name + required args. Bypasses Ollama's `/api/chat` tool parser, which is unreliable on `nemotron`-arch models in 0.24. | +| 4 | `run_logits.py` | `logits.json` | for the same prompts, runs transformers forward pass and Ollama with `logprobs=true`, compares next-token top-K distributions. Catches quantization / conversion regressions invisible to a binary tool-call test. | +| 5 | `compare.py` | stdout + exit code | unified report. | + +### Reading the report + +`compare.py` prints three sections. + +**Token-count comparison** — per case, two columns: +- `tokenizer` = transformers `apply_chat_template(tokenize=True)` length vs + Ollama's `prompt_eval_count` on the same rendered prompt via raw mode. +- `chat template` = same but Ollama applies its own template. 
+ +Some mismatches are flagged `[WARN]` (with note) instead of `[FAIL]`: +- **tokenizer +1 in tool cases**: known llama.cpp SentencePiece quirk + — a single space following a special/added token gets segmented as a + spurious `▁▁` (two-space) piece. See the *Known issues* section + below. +- **chat template −N in tool cases**: Ollama renders each tool via + Go's `json.Marshal` (compact JSON, no spaces); jinja's `tojson` uses + pretty JSON (with spaces). Same data, only whitespace differs. + +**Behavioural check** — for each `expected_behavior` case, did the +model emit a valid `<tool_call>` block with the right name + args? + +**Logit comparison** — for each case (skipping ones without a generation +prompt), how close are the next-token distributions? + +- `top1` ✓/✗: same most-likely next token (matched by vocab id) +- `|Δlp_top1|`: absolute logprob diff on the chosen token + (fp16-vs-fp16: typically < 0.1; Q4_K_M: < 0.5 normal) +- `mean|Δlp|_top3`: mean of `|Δlp|` over TF's top-3 tokens +- `top5_overlap` / `miss`: how many of TF's top-5 are even in Ollama's top-K + +The aggregate thresholds for FAIL/WARN are documented at the top of +`compare.py`. + +### Exit code + +- `0` — PASS, or PASS with known acceptable warnings. +- `1` — at least one `[FAIL]` somewhere. + +--- + +## Step 3 — Layer-by-layer diagnostic (optional) + +When the logit step flags an unexpected regression on a specific case, +this localizes which layer (and which op type within the layer) is +introducing the divergence. 
+ +```bash +python run_layer_diff.py "$HF_MODEL" "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \ + --transformers-output results/__vs__/transformers.json \ + --case 02_system_user \ + --work-dir results/__vs__/layer_diff_02_system_user + +python compare_layers.py results/__vs__/layer_diff_02_system_user --top-tail-only +``` + +Outputs: +- `tf_layers.npz` — transformers per-layer hidden states + intermediate + hook outputs (input_layernorm, self_attn, post_attention_layernorm, + mlp, final_norm, logits). +- `gguf_layers.bin` — llama.cpp per-layer activations + (`attn_norm-i`, `ffn_inp-i`, `ffn_norm-i`, `l_out-i`, `result_norm`, + `result_output`). +- `layer_diff_report.txt` — per-pair max/mean abs diff, relative max, + cosine distance, l2 relative error. +- `layer_diff_overview.png` — log-scale divergence vs layer index, + one series per op type. +- `layer_diff_l_out.png` — focused view of per-layer block output drift. + +`--top-tail-only` restricts comparison to the **last token position** — +this is what matters for next-token prediction and avoids confusion at +the last layer where llama.cpp uses `inp_out_ids` to compute only the +last position. + +### How to read the layer-diff + +If `L00 attn_norm` matches to floating-point precision but `L00 ffn_inp` +diverges, the **attention block** is to blame. If `L00 attn_norm` already +diverges, the **input embedding or tokenization** is to blame (or the +input LN itself). And so on along the column of op types. + +--- + +## Known issues / current findings (Luciole 1B SFT 1.2) + +- **Tokenizer +1 in tool cases** — llama.cpp's SentencePiece-style + tokenizer emits a spurious `▁▁` (double-space) piece when a special + token is followed by exactly one literal space then text. Affects the + fixed instruction string `function name and arguments within + <tool_call></tool_call> XML tags:` in the system prompt of every + tool-using conversation. 
Reported upstream; harmless (decoded string + unchanged), but the model sees one out-of-distribution token per + request. Flagged `[WARN]`. + +- **Chat template `−N` in tool cases** — Ollama renders each tool with + Go's `json.Marshal` (compact); jinja uses pretty JSON. ~21 tokens + saved per tool definition. Cosmetic; the model parses both + identically. Flagged `[WARN]`. + +- **Logit drift at layer 0, attention block** — even with f16 GGUF + matching f16 transformers, the attention output already diverges + significantly at L00 (cos_d ≈ 0.05 on case 02). Most likely + PyTorch SDPA vs llama.cpp attention kernel: different reduction + orders in fp16 give different accumulation. Drifts to ~0.3 cos_d by + the last layer. Top-1 token usually still matches. + +- **`convert_hf_to_gguf.py` precision pitfall** — for the Nemotron + LayerNorm1p hack, `data_torch + 1` must be done in fp32, otherwise + the bf16 source values round before storage. Use + `data_torch.float() + 1`. Other entries in the converter with the + same `+ 1` pattern (Gemma, Nemotron-H, line ~8887 MTP block, lines + 5731+ for some mamba variant) should be audited similarly. 
+ +--- + +## File layout + +``` +test_conversion/ +├── README.md # this file +├── test_main.py # main orchestrator (steps 1–5) +├── test_cases.py # canonical test conversations +├── run_transformers.py # step 1 +├── run_ollama.py # step 2 +├── run_behavior.py # step 3 +├── run_logits.py # step 4 +├── compare.py # step 5 — unified report +├── run_layer_diff.py # layer-diff tool +├── compare_layers.py # layer-diff report + plots +├── test.sh # convenience wrapper (if present) +└── results/ # outputs land here, one subfolder per __vs__ + └── __vs__/ + ├── transformers.json + ├── ollama.json + ├── behavior.json + ├── logits.json + └── layer_diff_/ + ├── tf_layers.npz + ├── gguf_layers.bin + ├── meta.json + ├── layer_diff_report.txt + └── *.png +``` + +## Hard-coded paths to update + +`run_layer_diff.py` has the llama.cpp build path baked in at the top: + +```python +LLAMA_BIN_DIR = Path("/home/jlouradour/src.nowsl/llama.cpp/build/bin") +``` + +Change this if your build directory is elsewhere. diff --git a/test_conversion/compare.py b/test_conversion/compare.py new file mode 100644 index 00000000000..1cd182881db --- /dev/null +++ b/test_conversion/compare.py @@ -0,0 +1,348 @@ +""" +Compare transformers.json and ollama.json (token counts) and optionally +behavior.json (functional tool-call check), then print a per-test report. + +Token-count comparison (two checks per case): + + Tokenizer + Pass the transformers-rendered prompt through Ollama with raw=true + and compare prompt_eval_count to len(transformers token_ids). + Tests just the GGUF tokenizer, isolated from the chat template. + + Chat template + Pass the conversation through Ollama's /api/chat. + Tests Ollama's template + GGUF tokenizer together. + +Known acceptable divergences (reported as [WARN], not [FAIL]): + + [tokenizer +1 in tool cases] + SentencePiece single-space-after-special-token quirk in llama.cpp's + SPM tokenizer. The model sees one extra space-prefix token where the + HF tokenizer didn't. 
Harmless (same decoded string). See llama.cpp + issue tracker for the upstream bug. + + [chat template -N in tool cases] + Ollama renders each tool via Go's json.Marshal (compact JSON, no + spaces). The jinja template uses tojson (pretty JSON, with spaces). + Same data, same field order, just whitespace. The model parses both + identically; cosmetic. + +Behavioural section (only if --behavior path is provided): + + For each case with an `expected_behavior` field, run_behavior.py asked + Ollama to actually generate a turn and checked whether the assistant's + response satisfied the expectation (correct tool_call name + args). + +Logits section (only if --logits path is provided): + + Per-case next-token top-K log-probability comparison between + transformers (reference) and Ollama (GGUF). Catches subtle conversion + or quantization regressions that the binary behavioural test misses. + Aggregate metrics: top-1 agreement rate, mean KL divergence. + +Exit code is 0 iff no [FAIL] anywhere; [WARN]s are non-fatal. + + Heuristic thresholds for the logits section: + top-1 agreement < 50% -> [FAIL] (very likely conversion bug) + top-1 agreement < 80% -> [WARN] + aggregate |Δlp_top1| > 1.0 -> [FAIL] (model is very differently confident) + aggregate |Δlp_top1| > 0.3 -> [WARN] + aggregate mean|Δlp|_top3 > 3.0 -> [WARN] (not a FAIL — top-3 is + sensitive to tail noise; use + only as a soft signal) + + Notes: + - Top-1 lp diff is the primary numeric signal. For fp16-vs-fp16 it + is typically < 0.1. For Q4_K_M, < 0.5 is normal. + - mean|Δlp|_top3 is reported for completeness but is noisy by nature: + fp16 softmax precision degrades for low-probability tokens, so a + single tail outlier can drag the mean up. Use top-1 metrics for + pass/fail; treat mean_top3 as informational. + - Top-1 mismatches often differ only in vocab variant (e.g. 
`▁The` + vs `The` for the same word) — those are real model behaviour + differences worth noting but typically caused by SP/llama.cpp + tokenization quirks, not gross conversion errors. + +Usage: + python compare.py + [--behavior ] + [--logits ] +""" + +import argparse +import json +import sys +from pathlib import Path + + +def load(path): + return {r["name"]: r for r in json.loads(Path(path).read_text())} + + +def fmt(status): + return {"ok": "[ OK ]", "warn": "[WARN]", "fail": "[FAIL]"}[status] + + +def classify(case_has_tools, tf_count, actual_count, kind): + """Return (status, label, note) for one column.""" + if actual_count is None: + return "ok", "skipped", None # neutral; not a failure if intentionally skipped + diff = actual_count - tf_count + if diff == 0: + return "ok", f"{tf_count} vs {actual_count}", None + + if kind == "tokenizer" and diff == 1 and case_has_tools: + return ("warn", + f"{tf_count} vs {actual_count} (+1)", + "SPM single-space-after-special-token quirk (llama.cpp tokenizer bug; harmless)") + if kind == "chat" and diff < 0 and case_has_tools: + return ("warn", + f"{tf_count} vs {actual_count} ({diff:+d})", + "Ollama renders tools as compact JSON; jinja uses pretty JSON (whitespace only; cosmetic)") + return "fail", f"{tf_count} vs {actual_count} ({diff:+d})", None + + +def report_counts(tf_map, ol_map): + """Returns (has_any_fail, has_any_warn, collected_notes_by_name).""" + names = sorted(set(tf_map) | set(ol_map)) + notes_by_name = {} # name -> list of strings + any_fail = False + any_warn = False + + print(f"\n{'name':<40} {'tokenizer':<32} {'chat template':<32}") + print("-" * 110) + + for name in names: + tf_r = tf_map.get(name) + ol_r = ol_map.get(name) + + if tf_r is None or "error" in tf_r: + err = tf_r.get("error", "missing") if tf_r else "missing" + print(f"{name:<40} transformers ERROR: {err}") + any_fail = True + continue + if ol_r is None: + print(f"{name:<40} ollama ERROR: missing") + any_fail = True + continue + + tf_count 
= tf_r["token_count"] + has_tools = tf_r.get("tools") is not None + + # Tokenizer probe + raw_err = ol_r.get("raw_error") + if raw_err: + tok_status, tok_label = "fail", f"err: {raw_err[:24]}" + tok_note = None + else: + tok_status, tok_label, tok_note = classify( + has_tools, tf_count, ol_r.get("raw_prompt_eval_count"), "tokenizer") + + # Chat template probe + chat_err = ol_r.get("chat_error") + if chat_err: + chat_status, chat_label = "fail", f"err: {chat_err[:24]}" + chat_note = None + else: + chat_status, chat_label, chat_note = classify( + has_tools, tf_count, ol_r.get("chat_prompt_eval_count"), "chat") + + tok_cell = f"{fmt(tok_status)} {tok_label}" + chat_cell = f"{fmt(chat_status)} {chat_label}" + print(f"{name:<40} {tok_cell:<32} {chat_cell:<32}") + + notes = [] + if tok_note: notes.append(f"tokenizer: {tok_note}") + if chat_note: notes.append(f"chat: {chat_note}") + if notes: + notes_by_name[name] = notes + + any_fail = any_fail or (tok_status == "fail" or chat_status == "fail") + any_warn = any_warn or (tok_status == "warn" or chat_status == "warn") + + # Print accumulated WARN notes (each unique note once is more readable) + if notes_by_name: + print() + print("WARN notes:") + printed = set() + for name, notes in notes_by_name.items(): + for n in notes: + if n not in printed: + print(f" - {n}") + printed.add(n) + print(" (cases with these warnings: " + + ", ".join(sorted(notes_by_name)) + ")") + + return any_fail, any_warn + + +def report_logits(logits_list): + """Logits section. 
Returns (has_any_fail, has_any_warn).""" + print(f"\n{'name':<40} {'top1':<5} {'|Δlp_top1|':<11} {'mean|Δlp|_top3':<15} " + f"{'top5':<5} {'miss':<5} {'tf top-1':<22} {'ol top-1':<22}") + print("-" * 140) + + top1_total = 0 + top1_matches = 0 + top1_diffs = [] + top3_means = [] + skipped = [] + any_error = False + + for r in logits_list: + name = r["name"] + if "skipped" in r: + skipped.append((name, r["skipped"])) + continue + if "error" in r: + print(f"{name:<40} ERROR: {r['error']}") + any_error = True + continue + cmp = r.get("comparison") + if not cmp: + print(f"{name:<40} no comparison (empty logprobs?)") + any_error = True + continue + + top1_total += 1 + if cmp["top1_match"]: + top1_matches += 1 + if cmp.get("top1_lp_diff") is not None: + top1_diffs.append(cmp["top1_lp_diff"]) + if cmp.get("mean_lp_diff_top3") is not None: + top3_means.append(cmp["mean_lp_diff_top3"]) + + f = lambda v: (f"{v:.4f}" if v is not None else "n/a") + miss_s = f"{cmp['tf_top5_missing_in_ollama_topk']}/5" + tf1 = cmp["tf_top1"]; ol1 = cmp["ol_top1"] + tf_lbl = f"{tf1['tok']!r}@{tf1['lp']}" + ol_lbl = f"{ol1['tok']!r}@{ol1['lp']}" + marker = "✓" if cmp["top1_match"] else "✗" + print(f"{name:<40} {marker:<5} {f(cmp.get('top1_lp_diff')):<11} " + f"{f(cmp.get('mean_lp_diff_top3')):<15} {cmp['top5_overlap']}/5 " + f"{miss_s:<5} {tf_lbl[:21]:<22} {ol_lbl[:21]:<22}") + + if skipped: + print() + print(" Skipped (no add_generation_prompt — no canonical next token):") + for n, why in skipped: + print(f" - {n} ({why})") + + print() + if top1_total == 0: + print(" (no logit comparisons completed)") + return any_error, False + + top1_rate = top1_matches / top1_total + agg_top1 = (sum(top1_diffs) / len(top1_diffs)) if top1_diffs else None + agg_top3 = (sum(top3_means) / len(top3_means)) if top3_means else None + + print(f" aggregate over {top1_total} comparable case(s):") + print(f" top-1 agreement = {top1_matches}/{top1_total} = {top1_rate*100:.1f}%") + if agg_top1 is not None: + print(f" 
aggregate |Δlp_top1| = {agg_top1:.4f}") + if agg_top3 is not None: + print(f" aggregate mean|Δlp|_top3 = {agg_top3:.4f}") + + fail = False + warn = False + if top1_rate < 0.5: + print(f" [FAIL] top-1 agreement {top1_rate*100:.1f}% < 50% — likely a conversion bug") + fail = True + elif top1_rate < 0.8: + print(f" [WARN] top-1 agreement {top1_rate*100:.1f}% < 80% — investigate the mismatching cases") + warn = True + if agg_top1 is not None: + if agg_top1 > 1.0: + print(f" [FAIL] |Δlp_top1| {agg_top1:.4f} > 1.0 — confidence on top token diverges sharply") + fail = True + elif agg_top1 > 0.3: + print(f" [WARN] |Δlp_top1| {agg_top1:.4f} > 0.3 — model is less confident on chosen tokens; investigate") + warn = True + if agg_top3 is not None and agg_top3 > 3.0: + # WARN-only: top-3 mean is noisy by nature (fp16 softmax tail). + print(f" [WARN] mean|Δlp|_top3 {agg_top3:.4f} > 3.0 — distribution shifted in top-3 (soft signal)") + warn = True + return (fail or any_error), warn + + +def report_behavior(behavior_map): + """Behavioural section. 
Returns has_any_fail.""" + print(f"\n{'name':<40} {'behaviour':<60}") + print("-" * 110) + + any_fail = False + for name in sorted(behavior_map): + r = behavior_map[name] + ok = r.get("pass") is True + reason = r.get("fail_reason") or "" + status = "ok" if ok else "fail" + print(f"{name:<40} {fmt(status)} {reason[:55]}") + if not ok: + any_fail = True + # Print actual model output details for debugging + raw = r.get("raw_output") + if raw is not None: + snippet = raw if len(raw) <= 200 else raw[:200] + "...(truncated)" + print(f" raw model output: {snippet!r}") + parsed = r.get("parsed_tool_call") + if parsed: + print(f" parsed tool_call: name={parsed.get('name')!r} args={parsed.get('arguments')!r}") + return any_fail + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("transformers_json") + parser.add_argument("ollama_json") + parser.add_argument("--behavior", default=None, + help="Optional behaviour JSON from run_behavior.py") + parser.add_argument("--logits", default=None, + help="Optional logits JSON from run_logits.py") + args = parser.parse_args() + + tf_map = load(args.transformers_json) + ol_map = load(args.ollama_json) + + print("=" * 120) + print("TOKEN COUNT COMPARISON") + print("=" * 120) + count_fail, count_warn = report_counts(tf_map, ol_map) + + behavior_fail = False + if args.behavior and Path(args.behavior).exists(): + beh_map = load(args.behavior) + if beh_map: + print() + print("=" * 120) + print("BEHAVIOURAL CHECK (model actually called the right tool)") + print("=" * 120) + behavior_fail = report_behavior(beh_map) + + logits_fail = False + logits_warn = False + if args.logits and Path(args.logits).exists(): + logits_list = json.loads(Path(args.logits).read_text()) + if logits_list: + print() + print("=" * 120) + print("LOGIT COMPARISON (next-token top-K distribution; transformers vs Ollama)") + print("=" * 120) + logits_fail, logits_warn = report_logits(logits_list) + + print() + any_fail = count_fail or behavior_fail or 
logits_fail + any_warn = count_warn or logits_warn + if any_fail: + print(">>> RESULT: FAIL") + sys.exit(1) + elif any_warn: + print(">>> RESULT: PASS (with known acceptable warnings)") + sys.exit(0) + else: + print(">>> RESULT: PASS") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/test_conversion/compare_layers.py b/test_conversion/compare_layers.py new file mode 100644 index 00000000000..2522343068c --- /dev/null +++ b/test_conversion/compare_layers.py @@ -0,0 +1,283 @@ +""" +Compare per-layer activations from tf_layers.npz and gguf_layers.bin +and produce a divergence plot. + +For each pair (transformers tensor, GGUF tensor) referring to the same +position in the graph, we compute: + + max_abs_diff max |x_tf - x_gg| in fp32 + mean_abs_diff mean over all elements + rel_max max_abs_diff / max(|x_tf|, |x_gg|, 1e-8) + cosine 1 - cos(x_tf, x_gg) (smaller = closer) + l2_rel ||x_tf - x_gg|| / ||x_tf|| + +Mappings used (nemotron architecture): + GGUF transformers + l_out-i hidden-(i+1) per-layer output (post-residual) + attn_norm-i attn_norm-i input_layernorm output + ffn_inp-i (attn-residual sum) not directly hookable in transformers; + approximated as hidden-i + self_attn-i + ffn_norm-i post_norm-i post_attention_layernorm output + ffn_out-i (mlp + residual) approximated as ffn_inp + mlp output + result_norm final_norm output of model.model.norm + result_output logits final LM head + +The first three columns of the report use l_out / final_norm / logits, which +are the most reliable (no hook approximation). The fine-grained per-op +analysis uses the rest. 
+ +Output: + /layer_diff_report.txt text report + /layer_diff_overview.png log-scale per-layer divergence plot + /layer_diff_by_op.png per-op-type breakdown + +Usage: + python compare_layers.py +""" + +import argparse +import json +import struct +import sys +from collections import defaultdict +from pathlib import Path + +import numpy as np + + +# GGML type ids → numpy dtype (for the ones we'll encounter on activation tensors). +GGML_TYPE = { + 0: ("f32", np.float32), + 1: ("f16", np.float16), + 24: ("bf16", None), # handled specially + 26: ("i32", np.int32), + 30: ("i64", np.int64), +} + + +def parse_gguf_dump(path: Path): + """Yield (name, np_array_fp32) from the binary dump.""" + with open(path, "rb") as f: + data = f.read() + i = 0 + n = 0 + while i < len(data): + if i + 4 > len(data): + break + name_len = struct.unpack_from(" 1024: + break + name = data[i:i+name_len].decode("utf-8", errors="replace"); i += name_len + dtype = struct.unpack_from(" 1) or (1,) + try: + arr = arr.reshape(shape[::-1]) # ggml stores ne in element-stride order + except ValueError: + # fallback: leave as flat + pass + yield name, arr + n += 1 + + +def compute_diff_metrics(x_tf, x_gg): + """Both inputs flattened to fp32 and same total element count.""" + if x_tf.size != x_gg.size: + return None + a = x_tf.astype(np.float32).reshape(-1) + b = x_gg.astype(np.float32).reshape(-1) + diff = a - b + abs_diff = np.abs(diff) + max_abs = float(abs_diff.max()) + mean_abs = float(abs_diff.mean()) + denom = float(max(np.abs(a).max(), np.abs(b).max(), 1e-8)) + rel_max = max_abs / denom + # cosine-distance + na = float(np.linalg.norm(a)) + nb = float(np.linalg.norm(b)) + cos = 1.0 - float(a @ b) / (na * nb + 1e-12) + l2_rel = float(np.linalg.norm(diff)) / (na + 1e-12) + return dict(max_abs=max_abs, mean_abs=mean_abs, rel_max=rel_max, cosine=cos, l2_rel=l2_rel) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("work_dir") + parser.add_argument("--top-tail-only", 
action="store_true", + help="For multi-token tensors, only compare the LAST token position " + "(matches what next-token prediction uses).") + args = parser.parse_args() + + work_dir = Path(args.work_dir).resolve() + tf_path = work_dir / "tf_layers.npz" + gg_path = work_dir / "gguf_layers.bin" + meta = json.loads((work_dir / "meta.json").read_text()) + + print(f"Comparing layer activations for case {meta['case']!r}") + print(f" HF dir: {meta['hf_model_dir']}") + print(f" GGUF: {meta['gguf_path']}") + print() + + tf = dict(np.load(tf_path)) + gg = dict(parse_gguf_dump(gg_path)) + print(f"transformers: {len(tf)} arrays") + print(f"gguf: {len(gg)} arrays") + + n_layers = max(int(k.split("-")[1]) for k in tf if k.startswith("hidden-")) + 1 - 1 + # hidden_states has N+1 entries (0..N); N = num_layers + print(f"layers: {n_layers}") + + T = int(tf["tokens"].shape[-1]) + print(f"tokens: {T}") + + # Mapping rules: each entry is (label, op_type, tf_array_or_callable, gg_key) + # tf entry can be a string (npz key) or a callable taking the tf dict and + # returning an array — to combine multiple hook outputs. 
+ def add_resid(i): + """ffn_inp = hidden[i] + self_attn[i] (post-attention, pre-MLP, with residual)""" + return lambda tfd: tfd[f"hidden-{i}"] + tfd[f"self_attn-{i}"] + + mappings = [] + for i in range(n_layers): + mappings.append((f"L{i:02d} attn_norm", "norm", f"attn_norm-{i}", f"attn_norm-{i}")) + mappings.append((f"L{i:02d} ffn_inp", "post_attn", add_resid(i), f"ffn_inp-{i}")) + mappings.append((f"L{i:02d} ffn_norm", "norm", f"post_norm-{i}", f"ffn_norm-{i}")) + mappings.append((f"L{i:02d} l_out", "block_out", f"hidden-{i+1}", f"l_out-{i}")) + mappings.append(("final_norm", "norm", "final_norm", "result_norm")) + mappings.append(("logits", "head", "logits", "result_output")) + + rows = [] + for label, op, tk, gk in mappings: + # Resolve transformers tensor (string key or callable on the dict) + if callable(tk): + try: + x_tf = tk(tf) + except KeyError as e: + rows.append({"label": label, "op": op, "skip": f"missing tf key {e}"}) + continue + else: + if tk not in tf: + rows.append({"label": label, "op": op, "skip": f"missing tf={tk}"}) + continue + x_tf = tf[tk] + if gk not in gg: + rows.append({"label": label, "op": op, "skip": f"missing gg={gk}"}) + continue + x_gg = gg[gk] + if args.top_tail_only: + # Pick last token slice + x_tf = x_tf.reshape(-1, x_tf.shape[-1])[-1] if x_tf.ndim >= 2 else x_tf + x_gg = x_gg.reshape(-1, x_gg.shape[-1])[-1] if x_gg.ndim >= 2 else x_gg + m = compute_diff_metrics(x_tf, x_gg) + if m is None: + rows.append({"label": label, "op": op, "skip": f"shape mismatch tf={x_tf.shape} gg={x_gg.shape}"}) + continue + rows.append({"label": label, "op": op, **m, "shape": tuple(x_tf.shape)}) + + # ─── Text report ─── + out_txt = work_dir / "layer_diff_report.txt" + lines = [] + lines.append(f"Layer-by-layer activation comparison — case {meta['case']!r}") + lines.append(f"HF dtype: {meta['dtype']} on {meta['device']}") + lines.append("") + lines.append(f"{'label':<25} {'op':<10} {'shape':<22} {'max|Δ|':<10} {'mean|Δ|':<10} {'rel_max':<9} 
{'cos_d':<10} {'l2_rel':<10}") + lines.append("-" * 120) + for r in rows: + if "skip" in r: + lines.append(f"{r['label']:<25} {r['op']:<10} SKIP: {r['skip']}") + continue + shape = str(r["shape"]) + lines.append(f"{r['label']:<25} {r['op']:<10} {shape:<22} " + f"{r['max_abs']:<10.4g} {r['mean_abs']:<10.4g} " + f"{r['rel_max']:<9.3g} {r['cosine']:<10.4g} {r['l2_rel']:<10.4g}") + report = "\n".join(lines) + out_txt.write_text(report + "\n") + print() + print(report) + print() + print(f"Wrote {out_txt}") + + # ─── Plots ─── + try: + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + except ImportError: + print("matplotlib not installed; skipping plots (pip install matplotlib)") + return + + valid = [r for r in rows if "skip" not in r] + layer_rows = [r for r in valid if r["label"].startswith("L")] + + # Plot 1: overview, divergence vs layer index, separate lines per op + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + by_op = defaultdict(list) + for r in layer_rows: + # label is "L00 attn_norm" / "L00 ffn_norm" / "L00 l_out" + op_label = r["label"].split()[1] + layer_idx = int(r["label"][1:3]) + by_op[op_label].append((layer_idx, r)) + + for ax, metric in zip(axes, ["max_abs", "l2_rel"]): + for op_label, lst in by_op.items(): + lst.sort() + xs = [li for li, _ in lst] + ys = [r[metric] for _, r in lst] + ax.plot(xs, ys, marker="o", label=op_label) + # add final_norm and logits as scatter at x = N + for r in valid: + if r["label"] == "final_norm": + ax.scatter([n_layers], [r[metric]], marker="*", s=200, label="final_norm") + if r["label"] == "logits": + ax.scatter([n_layers + 0.3], [r[metric]], marker="X", s=150, label="logits") + ax.set_xlabel("layer index") + ax.set_ylabel(metric) + ax.set_yscale("log") + ax.set_title(f"divergence per layer ({metric})") + ax.grid(True, which="both", alpha=0.3) + ax.legend(fontsize=8) + fig.suptitle(f"transformers vs GGUF — case {meta['case']!r}") + fig.tight_layout() + p1 = work_dir / 
"layer_diff_overview.png" + fig.savefig(p1, dpi=110, bbox_inches="tight") + print(f"Wrote {p1}") + + # Plot 2: cumulative growth of l_out divergence to highlight where drift accumulates + fig, ax = plt.subplots(figsize=(10, 5)) + l_out_rows = sorted(by_op.get("l_out", [])) + if l_out_rows: + xs = [li for li, _ in l_out_rows] + for metric in ["max_abs", "l2_rel", "cosine"]: + ys = [r[metric] for _, r in l_out_rows] + ax.plot(xs, ys, marker="o", label=metric) + ax.set_xlabel("layer index") + ax.set_ylabel("metric") + ax.set_yscale("log") + ax.set_title(f"l_out divergence over depth — case {meta['case']!r}") + ax.grid(True, which="both", alpha=0.3) + ax.legend() + fig.tight_layout() + p2 = work_dir / "layer_diff_l_out.png" + fig.savefig(p2, dpi=110, bbox_inches="tight") + print(f"Wrote {p2}") + + +if __name__ == "__main__": + main() diff --git a/test_conversion/run_behavior.py b/test_conversion/run_behavior.py new file mode 100644 index 00000000000..8c1762d2194 --- /dev/null +++ b/test_conversion/run_behavior.py @@ -0,0 +1,224 @@ +""" +Behavioural test: for each test case that has an `expected_behavior` field, +actually run the model and verify the assistant response satisfies the +expectation (e.g. emits a tool_call with the right name + args). + +IMPORTANT design note — why /api/generate raw=true: + + Ollama's /api/chat tool-call parser (at least in 0.24) silently drops + model output when it detects a tool_call tag but fails to extract a + valid call mid-stream. The model can emit a perfectly well-formed + ... block and the chat API still returns + {"content": "", "tool_calls": null}. That hides whether the *model* + works. + + To test the model behind Ollama, we bypass the chat layer entirely: + 1. Take the transformers-rendered prompt (already computed in + transformers.json — same exact string the model would see at + training time). + 2. Feed it via /api/generate with raw=true (no template, no parsing). + 3. 
Read back the raw text the model emitted, parse blocks + ourselves with a regex, and check the expectation. + + This tests the GGUF model + tokenizer end-to-end without Ollama's chat + quirks getting in the way. + +Currently supports one kind of expectation: + + expected_behavior = { + "tool_call": { + "name": "get_weather", + "required_args": ["location"], + "args_must_contain": {"location": "paris"}, # case-insensitive substring + } + } + +Output JSON: one entry per case with expected_behavior, including +{name, expected, raw_output, parsed_tool_call, pass, fail_reason}. + +Usage: + python run_behavior.py + --transformers-output + [--model-name NAME] + [--ollama-url URL] + [--num-predict N] +""" + +import argparse +import json +import re +import sys +import traceback +from pathlib import Path + +try: + import requests +except ImportError: + sys.exit("ERROR: this script needs the 'requests' package (pip install requests).") + +sys.path.insert(0, str(Path(__file__).parent)) +from test_cases import TEST_CASES # noqa: E402 +from run_ollama import check_ollama_alive, ollama_create, ollama_delete, _post # noqa: E402 + + +# Matches " ... " with any whitespace inside. +TOOL_CALL_RE = re.compile(r"\s*(\{.*?\})\s*", re.DOTALL) + + +def ollama_generate_raw(url, model, prompt, num_predict): + payload = { + "model": model, + "prompt": prompt, + "raw": True, + "stream": False, + "options": { + "num_predict": num_predict, + "temperature": 0, + "seed": 0, + # Stop at the chat-message terminator so we don't waste tokens + # generating into the next turn. + "stop": ["<|im_end|>"], + }, + } + return _post(f"{url}/api/generate", payload) + + +def parse_tool_call_from_text(text): + """Find the first ... block and parse its JSON. + + Returns (call_dict_or_None, error_str_or_None) where call_dict is + {"name": str, "arguments": dict} on success. + """ + m = TOOL_CALL_RE.search(text) + if not m: + return None, "no ... 
block in output" + body = m.group(1) + try: + obj = json.loads(body) + except json.JSONDecodeError as e: + return None, f" body is not valid JSON: {e}; body={body!r}" + if "name" not in obj: + return None, f" body has no 'name' field; body={obj!r}" + args = obj.get("arguments", {}) + if isinstance(args, str): + # Some templates emit arguments as a JSON-encoded string. + try: + args = json.loads(args) + except json.JSONDecodeError: + pass + return {"name": obj["name"], "arguments": args}, None + + +def evaluate_tool_call(expected, parsed): + """Check the parsed tool call matches expectation. + + Returns (pass: bool, reason: str | None). + """ + if parsed["name"] != expected["name"]: + return False, f"wrong tool name: got {parsed['name']!r}, expected {expected['name']!r}" + + args = parsed["arguments"] + if not isinstance(args, dict): + return False, f"arguments not a dict: {args!r}" + + for key in expected.get("required_args", []): + if key not in args: + return False, f"missing required arg {key!r} (got args: {list(args)})" + + for key, needle in expected.get("args_must_contain", {}).items(): + val = args.get(key) + if val is None: + return False, f"missing arg {key!r}" + if needle.lower() not in str(val).lower(): + return False, f"arg {key!r}={val!r} does not contain {needle!r}" + + return True, None + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("modelfile_path") + parser.add_argument("output_json") + parser.add_argument("--transformers-output", required=True, + help="Path to transformers.json (provides the exact " + "rendered prompt for each case)") + parser.add_argument("--model-name", default="test-chat-template-tmp", + help="Temporary ollama model name (created/deleted by the script).") + parser.add_argument("--ollama-url", default="http://localhost:11434") + parser.add_argument("--num-predict", type=int, default=256, + help="Max tokens the model may generate per case.") + args = parser.parse_args() + + version = 
check_ollama_alive(args.ollama_url) + print(f"[behavior] Ollama reachable (version {version})") + + tf_path = Path(args.transformers_output) + if not tf_path.exists(): + sys.exit(f"ERROR: transformers output not found at {tf_path}. " + "Run run_transformers.py first.") + transformers_by_name = {r["name"]: r for r in json.loads(tf_path.read_text())} + + cases_with_behavior = [c for c in TEST_CASES if c.get("expected_behavior")] + if not cases_with_behavior: + print("[behavior] No test cases have an 'expected_behavior' field; nothing to do.") + Path(args.output_json).write_text("[]") + return + + print(f"[behavior] {len(cases_with_behavior)} behavioural case(s) to run") + + ollama_create(args.model_name, args.modelfile_path) + try: + results = [] + for case in cases_with_behavior: + print(f"[behavior] {case['name']}") + entry = { + "name": case["name"], + "expected_behavior": case["expected_behavior"], + } + try: + tf = transformers_by_name.get(case["name"]) + if tf is None or "rendered_prompt" not in tf: + raise RuntimeError( + f"no rendered_prompt for {case['name']} in transformers output" + ) + resp = ollama_generate_raw( + args.ollama_url, args.model_name, + tf["rendered_prompt"], num_predict=args.num_predict, + ) + raw_output = resp.get("response", "") + entry["raw_output"] = raw_output + entry["eval_count"] = resp.get("eval_count") + + eb = case["expected_behavior"] + if "tool_call" in eb: + parsed, parse_err = parse_tool_call_from_text(raw_output) + entry["parsed_tool_call"] = parsed + if parsed is None: + entry["pass"] = False + entry["fail_reason"] = parse_err + else: + ok, reason = evaluate_tool_call(eb["tool_call"], parsed) + entry["pass"] = ok + entry["fail_reason"] = reason + else: + entry["pass"] = False + entry["fail_reason"] = f"unknown expected_behavior keys: {list(eb)}" + except Exception as e: + traceback.print_exc() + entry["pass"] = False + entry["fail_reason"] = f"{type(e).__name__}: {e}" + + marker = "OK " if entry.get("pass") else "FAIL" + 
print(f" -> {marker} {entry.get('fail_reason') or ''}") + results.append(entry) + finally: + ollama_delete(args.model_name) + + out = Path(args.output_json) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(results, indent=2, ensure_ascii=False)) + print(f"[behavior] Wrote {len(results)} results to {out}") + + +if __name__ == "__main__": + main() diff --git a/test_conversion/run_layer_diff.py b/test_conversion/run_layer_diff.py new file mode 100644 index 00000000000..167eca64c19 --- /dev/null +++ b/test_conversion/run_layer_diff.py @@ -0,0 +1,180 @@ +""" +Layer-by-layer activation dump for both backends, on a single anchor prompt. + +Outputs two files in : + + tf_layers.npz — numpy archive with one array per intermediate tensor: + "tokens" : input token ids (1, T) + "hidden-i" for i in 0..N : per-layer output of the i-th block + (transformers .hidden_states) + "attn_norm-i" for i in 0..N-1 : output of input_layernorm + "self_attn-i" for i in 0..N-1 : output of the attention block (without residual) + "post_norm-i" for i in 0..N-1 : output of post_attention_layernorm + "mlp-i" for i in 0..N-1 : output of MLP (without residual) + "final_norm" : after model.model.norm + "logits" : final LM head output + + gguf_layers.bin — binary dump from llama-eval-callback (env-gated). + Records of: u32 name_len, name, u32 dtype, i64 ne[4], u64 nbytes, data. + +The companion compare_layers.py loads both and computes per-layer divergence. 
+ +Usage: + python run_layer_diff.py + --transformers-output + --case + --work-dir + [--device cuda|cpu] + [--dtype fp16|fp32|bf16] +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import numpy as np +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +LLAMA_BIN_DIR = Path("/home/jlouradour/src.nowsl/llama.cpp/build/bin") +EVAL_CALLBACK_BIN = LLAMA_BIN_DIR / "llama-eval-callback" + +# Tensor name regex passed to the patched llama.cpp dumper. +# Keep aligned with the names cb()'d by src/models/nemotron.cpp. +GGUF_DUMP_REGEX = r'^(attn_norm|ffn_inp|ffn_norm|ffn_out|l_out|result_norm|result_output)-?[0-9]*$' + + +def transformers_dump(model_dir: Path, prompt: str, device: str, dtype: torch.dtype, out_path: Path): + print(f"[tf] Loading model from {model_dir} ({device}, {dtype})") + tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + str(model_dir), torch_dtype=dtype, trust_remote_code=True, + ).to(device) + model.eval() + + # Match the way our test renders prompts: tokenize the raw rendered prompt + # without adding extra special tokens — the prompt already contains them. 
+ inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device) + input_ids = inputs["input_ids"] + print(f"[tf] Input tokens: {input_ids.shape}, last-pos id = {int(input_ids[0,-1])}") + + captures = {} + + def hook_for(key): + def fn(_mod, _inp, out): + t = out[0] if isinstance(out, tuple) else out + captures[key] = t.detach().cpu().float().numpy() + return fn + + handles = [] + for i, layer in enumerate(model.model.layers): + handles.append(layer.input_layernorm.register_forward_hook(hook_for(f"attn_norm-{i}"))) + handles.append(layer.self_attn.register_forward_hook(hook_for(f"self_attn-{i}"))) + handles.append(layer.post_attention_layernorm.register_forward_hook(hook_for(f"post_norm-{i}"))) + handles.append(layer.mlp.register_forward_hook(hook_for(f"mlp-{i}"))) + handles.append(model.model.norm.register_forward_hook(hook_for("final_norm"))) + + with torch.no_grad(): + out = model(**inputs, output_hidden_states=True) + + for h in handles: + h.remove() + + # Per-layer hidden states (hidden[i] is the output of layer i; hidden[0] = embeddings) + for i, h in enumerate(out.hidden_states): + captures[f"hidden-{i}"] = h.detach().cpu().float().numpy() + captures["logits"] = out.logits.detach().cpu().float().numpy() + captures["tokens"] = input_ids.cpu().numpy() + + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez(str(out_path), **captures) + print(f"[tf] Saved {len(captures)} arrays to {out_path}") + + # Free memory: model is no longer needed + del model + if device == "cuda": + torch.cuda.empty_cache() + + +def gguf_dump(gguf_path: Path, prompt: str, out_path: Path): + out_path.parent.mkdir(parents=True, exist_ok=True) + if out_path.exists(): + out_path.unlink() + env = os.environ.copy() + env["LD_LIBRARY_PATH"] = str(LLAMA_BIN_DIR) + ":" + env.get("LD_LIBRARY_PATH", "") + env["LLAMA_DUMP_TENSORS_FILE"] = str(out_path) + env["LLAMA_DUMP_TENSORS_REGEX"] = GGUF_DUMP_REGEX + # Atomic tokenization of <|im_start|> etc., so the token count 
matches + # what HF transformers produces on a fully-rendered chat-template prompt. + env["LLAMA_TOKENIZE_PARSE_SPECIAL"] = "1" + + cmd = [str(EVAL_CALLBACK_BIN), + "-m", str(gguf_path), + "-p", prompt, + "-n", "1"] + print(f"[gguf] Running {EVAL_CALLBACK_BIN.name} with regex={GGUF_DUMP_REGEX!r}") + res = subprocess.run(cmd, env=env, capture_output=True, text=True) + if res.returncode != 0: + print(res.stdout[-1000:]) + print(res.stderr[-1000:]) + sys.exit(f"[gguf] llama-eval-callback failed (exit {res.returncode})") + if not out_path.exists() or out_path.stat().st_size == 0: + sys.exit(f"[gguf] dump file empty: {out_path}") + print(f"[gguf] Dump size: {out_path.stat().st_size/1024:.1f} KB") + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("hf_model_dir") + parser.add_argument("gguf_path") + parser.add_argument("--transformers-output", required=True, + help="Path to existing transformers.json (provides rendered_prompt per case)") + parser.add_argument("--case", required=True, + help="Test case name from test_cases.py to use as the anchor prompt") + parser.add_argument("--work-dir", required=True) + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", choices=["cuda", "cpu"]) + parser.add_argument("--dtype", default=None, choices=["fp16", "fp32", "bf16"], + help="Defaults to fp16 on cuda, fp32 on cpu") + args = parser.parse_args() + + hf_dir = Path(args.hf_model_dir).resolve() + gguf_path = Path(args.gguf_path).resolve() + work_dir = Path(args.work_dir).resolve() + work_dir.mkdir(parents=True, exist_ok=True) + + tf_json = json.loads(Path(args.transformers_output).read_text()) + case = next((c for c in tf_json if c["name"] == args.case), None) + if case is None: + sys.exit(f"case {args.case!r} not found in {args.transformers_output}") + prompt = case["rendered_prompt"] + + if args.dtype is None: + args.dtype = "fp16" if args.device == 
"cuda" else "fp32" + torch_dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[args.dtype] + + tf_out = work_dir / "tf_layers.npz" + gg_out = work_dir / "gguf_layers.bin" + + transformers_dump(hf_dir, prompt, args.device, torch_dtype, tf_out) + gguf_dump(gguf_path, prompt, gg_out) + + # Also save the prompt + tokens so compare_layers can sanity check. + meta = { + "case": args.case, + "prompt": prompt, + "device": args.device, + "dtype": args.dtype, + "hf_model_dir": str(hf_dir), + "gguf_path": str(gguf_path), + } + (work_dir / "meta.json").write_text(json.dumps(meta, indent=2, ensure_ascii=False)) + print(f"Saved meta to {work_dir / 'meta.json'}") + + +if __name__ == "__main__": + main() diff --git a/test_conversion/run_logits.py b/test_conversion/run_logits.py new file mode 100644 index 00000000000..c1dd875f3e2 --- /dev/null +++ b/test_conversion/run_logits.py @@ -0,0 +1,308 @@ +""" +Logit-level comparison: for each test case, compute the next-token +top-K log-probability distribution from + (a) the transformers model (forward pass on the rendered prompt) + (b) Ollama serving the GGUF (logprobs API on the same prompt) +and report top-1 agreement, top-5 overlap, and KL divergence. + +This catches subtle numerical regressions in the GGUF (quantization, +conversion bugs, wrong activation, etc.) that the binary tool-call +behavioural test would not notice. + +Per-case metrics: + + top1_match bool — same most-likely next token (most important) + top1_lp_diff float — |TF top-1 logprob − Ollama top-1 logprob|. + Concrete confidence delta on the chosen token. + fp16-vs-fp16: typically < 0.1. + Q4_K_M: typically < 0.5. + top5_overlap int — how many of TF's top-5 are in Ollama's top-5 (0..5). + mean_lp_diff_top3 float — primary aggregate metric: mean |Δlp| over TF's + top-3 tokens (aligned by token ID). 
Top-3 covers + the bulk of the probability mass; excluding the + 4-5 tail tokens avoids the high noise that fp16 + softmax has on low-probability logits. + mean_lp_diff_top5 float — same but over top-5; reported for completeness. + Naturally noisier; use top-3 for judgement. + tf_top5_missing int — count of TF's top-5 tokens not in Ollama's + top-K. High counts mean Ollama wasn't even close + on those tokens (significant divergence). + kl_div_renorm float — secondary: KL on renormalized common-top-K. + Can be inflated; ignore unless other signals + also flag. + +Cases whose rendered prompt does NOT end at a generation point (i.e. +add_generation_prompt was False — last message was assistant text/tool_calls, +no `<|im_start|>assistant\\n` suffix) are SKIPPED: there is no canonical +"next token" to predict there. + +Usage: + python run_logits.py + --transformers-output + [--model-name NAME] + [--ollama-url URL] + [--top-k K] + [--device cuda|cpu] + [--dtype fp16|fp32] +""" + +import argparse +import json +import math +import sys +import traceback +from pathlib import Path + +try: + import requests # noqa: F401 (imported via run_ollama as well, but be explicit) +except ImportError: + sys.exit("ERROR: this script needs the 'requests' package (pip install requests).") + +try: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer +except ImportError as e: + sys.exit(f"ERROR: this script needs torch + transformers ({e}).") + +sys.path.insert(0, str(Path(__file__).parent)) +from run_ollama import check_ollama_alive, ollama_create, ollama_delete, _post # noqa: E402 + + +def transformers_topk(model, tokenizer, prompt, top_k): + """Forward-pass the model and return top-K (token_id, token, logprob).""" + inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False) + device = next(model.parameters()).device + inputs = {k: v.to(device) for k, v in inputs.items()} + with torch.no_grad(): + out = model(**inputs) + logits = out.logits[0, 
-1].float() + logprobs = torch.log_softmax(logits, dim=-1) + vals, idxs = torch.topk(logprobs, top_k) + return [ + { + "token_id": int(i), + "token": tokenizer.convert_ids_to_tokens(int(i)), + "logprob": float(lp), + } + for i, lp in zip(idxs.tolist(), vals.tolist()) + ] + + +def ollama_topk(url, model_name, prompt, top_k): + """Get Ollama's top-K next-token logprobs (raw=true so it doesn't apply the chat template).""" + payload = { + "model": model_name, + "prompt": prompt, + "raw": True, + "stream": False, + "options": {"num_predict": 1, "temperature": 0, "seed": 0}, + "logprobs": True, + "top_logprobs": top_k, + } + resp = _post(f"{url}/api/generate", payload) + lps = resp.get("logprobs") or [] + if not lps: + return None + first = lps[0] + return [ + {"token": t["token"], "logprob": t["logprob"], "bytes": t.get("bytes")} + for t in first.get("top_logprobs", []) + ] + + +_SPM_SPACE = "▁" # ▁ — SentencePiece's word-boundary marker + + +def ollama_token_to_id(tokenizer, ol_entry, vocab): + """Map an Ollama-reported token to the transformers vocab ID. + + Critical: we look the token up DIRECTLY in the vocab dict, not via + tokenizer.encode(). The encoder normalizes (e.g. always converts a + leading literal-space `' Bonjour'` to `▁Bonjour`), which would COLLIDE + distinct GGUF vocab entries (`' Bonjour'`/▁Bonjour id=34362 vs + `'Bonjour'` id=21327) and cause ol_by_id[id] to be set to the WRONG + logprob (whichever distinct token appears later in the top-K list). + """ + s = ol_entry["token"] + + # 1. Try the SentencePiece form: leading literal-space → ▁ prefix. + spm_form = (_SPM_SPACE + s[1:]) if s.startswith(" ") else s + if spm_form in vocab: + return vocab[spm_form] + + # 2. Try the raw string (for non-space-prefixed tokens like 'Bonjour'). + if s in vocab: + return vocab[s] + + # 3. Last resort: lossy re-tokenize. May collide; logged via caller. 
def compare_topk(tf_top, ol_top, tokenizer):
    """Compare next-token top-K distributions and return per-case metrics.

    Args:
        tf_top: transformers entries, each a dict with ``token_id``, ``token``
            and ``logprob``. Index 0 is treated as the top-1 prediction.
        ol_top: Ollama entries, each a dict with ``token`` and ``logprob``.
            Index 0 is treated as Ollama's top-1 (assumes the server returns
            them best-first — confirm against the Ollama API).
        tokenizer: object exposing ``get_vocab()``; it is also passed through
            to ``ollama_token_to_id``.

    Returns:
        ``None`` when either list is empty, otherwise a dict with the keys
        ``top1_match``, ``tf_top1``, ``ol_top1``, ``top1_lp_diff``,
        ``top5_overlap``, ``tf_top5_missing_in_ollama_topk``,
        ``mean_lp_diff_top3``, ``mean_lp_diff_top5`` and ``kl_div_renorm``.
    """
    if not tf_top or not ol_top:
        return None

    # Annotate Ollama entries with transformers vocab IDs (direct vocab lookup).
    vocab = tokenizer.get_vocab()
    ol_with_ids = [
        {**t, "token_id": ollama_token_to_id(tokenizer, t, vocab)} for t in ol_top
    ]

    tf1 = tf_top[0]
    ol1 = ol_with_ids[0]
    top1_match = tf1["token_id"] == ol1["token_id"]

    tf_top5_ids = {t["token_id"] for t in tf_top[:5]}
    ol_top5_ids = {t["token_id"] for t in ol_with_ids[:5] if t["token_id"] is not None}
    top5_overlap = len(tf_top5_ids & ol_top5_ids)

    # PRIMARY: absolute logprob differences on TF's top-N tokens (aligned
    # by token ID via Ollama's top-K). Reported on top-1 (concrete) and
    # top-3 (aggregate). Top-5 is also computed for completeness.
    #
    # If two Ollama entries resolve to the same vocab ID, keep the FIRST
    # (highest-ranked) one. A plain dict comprehension would let the later,
    # lower-ranked entry win, so the logprob used for top1_lp_diff could
    # belong to a different entry than the one reported as ol_top1.
    ol_by_id = {}
    for t in ol_with_ids:
        if t["token_id"] is not None:
            ol_by_id.setdefault(t["token_id"], t["logprob"])

    def diffs_over(n):
        """Return (|Δlogprob| list, miss count) over TF's first n entries."""
        d = []
        miss = 0
        for t in tf_top[:n]:
            ol_lp = ol_by_id.get(t["token_id"])
            if ol_lp is None:
                miss += 1
            else:
                d.append(abs(t["logprob"] - ol_lp))
        return d, miss

    d3, _ = diffs_over(3)
    d5, missing5 = diffs_over(5)
    mean3 = (sum(d3) / len(d3)) if d3 else None
    mean5 = (sum(d5) / len(d5)) if d5 else None

    # Top-1 logprob diff specifically; only defined when both sides agree on
    # the top-1 token.
    top1_lp_diff = None
    if top1_match:
        ol_top1_lp = ol_by_id.get(tf1["token_id"])
        if ol_top1_lp is not None:
            top1_lp_diff = abs(tf1["logprob"] - ol_top1_lp)

    # SECONDARY METRIC: KL on renormalized common-top-K (kept for reference;
    # can be inflated when overlap is small).
    tf_by_id = {t["token_id"]: t["logprob"] for t in tf_top}
    common = set(tf_by_id) & set(ol_by_id)
    kl = None
    if common:
        tf_p = {i: math.exp(tf_by_id[i]) for i in common}
        ol_p = {i: math.exp(ol_by_id[i]) for i in common}
        s_tf = sum(tf_p.values())
        s_ol = sum(ol_p.values())
        if s_tf > 0 and s_ol > 0:
            kl = 0.0
            for i in common:
                p = tf_p[i] / s_tf
                q = ol_p[i] / s_ol
                # Skip vanishing terms to avoid log(0) / division blow-ups.
                if p > 1e-12 and q > 1e-12:
                    kl += p * math.log(p / q)

    return {
        "top1_match": top1_match,
        "tf_top1": {"id": tf1["token_id"], "tok": tf1["token"], "lp": round(tf1["logprob"], 4)},
        "ol_top1": {"id": ol1["token_id"], "tok": ol1["token"], "lp": round(ol1["logprob"], 4)},
        "top1_lp_diff": top1_lp_diff,
        "top5_overlap": top5_overlap,
        "tf_top5_missing_in_ollama_topk": missing5,
        "mean_lp_diff_top3": mean3,
        "mean_lp_diff_top5": mean5,
        "kl_div_renorm": kl,
    }
transformers_data = json.loads(tf_path.read_text()) + + if args.dtype is None: + args.dtype = "fp16" if args.device == "cuda" else "fp32" + torch_dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[args.dtype] + + print(f"[logits] Loading transformers model from {args.hf_model_dir} ({args.device}, {args.dtype})") + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + args.hf_model_dir, torch_dtype=torch_dtype, trust_remote_code=True, + ).to(args.device) + model.eval() + + ollama_create(args.model_name, args.modelfile_path) + try: + results = [] + for entry in transformers_data: + name = entry["name"] + prompt = entry.get("rendered_prompt") + if not prompt: + continue + if entry.get("add_generation_prompt") is False: + # No canonical next-token prediction: the conversation ends + # on the assistant's own message (closed by <|im_end|>). + # Skip — there's nothing meaningful to compare. 
+ print(f"[logits] {name} SKIP (no add_generation_prompt)") + results.append({"name": name, "skipped": "no add_generation_prompt"}) + continue + print(f"[logits] {name}") + try: + tf_top = transformers_topk(model, tokenizer, prompt, args.top_k) + ol_top = ollama_topk(args.ollama_url, args.model_name, prompt, args.top_k) + cmp = compare_topk(tf_top, ol_top, tokenizer) + results.append({ + "name": name, + "comparison": cmp, + "tf_top5": tf_top[:5], + "ol_top5": (ol_top or [])[:5], + }) + if cmp: + fmt = lambda v: (f"{v:.4f}" if v is not None else "n/a") + print(f" top1_match={cmp['top1_match']} " + f"|Δlp_top1|={fmt(cmp['top1_lp_diff'])} " + f"mean|Δlp|_top3={fmt(cmp['mean_lp_diff_top3'])} " + f"top5_overlap={cmp['top5_overlap']}/5 " + f"missing={cmp['tf_top5_missing_in_ollama_topk']}/5") + except Exception as e: + traceback.print_exc() + results.append({"name": name, "error": f"{type(e).__name__}: {e}"}) + finally: + ollama_delete(args.model_name) + del model + if args.device == "cuda": + torch.cuda.empty_cache() + + out = Path(args.output_json) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(results, indent=2, ensure_ascii=False)) + print(f"[logits] Wrote {len(results)} results to {out}") + + +if __name__ == "__main__": + main() diff --git a/test_conversion/run_ollama.py b/test_conversion/run_ollama.py new file mode 100644 index 00000000000..d7e836c01ea --- /dev/null +++ b/test_conversion/run_ollama.py @@ -0,0 +1,228 @@ +""" +For each test case, query Ollama and collect the input-token count. + +Two probes per case: + chat_prompt_eval_count : tokens fed to the model when the conversation + is passed through Ollama's /api/chat (i.e. + Ollama applies its Modelfile TEMPLATE, then + tokenizes with the GGUF tokenizer). + raw_prompt_eval_count : tokens fed when the *transformers-rendered* + prompt is passed through /api/generate with + raw=true (i.e. only the GGUF tokenizer runs; + no chat template applied). 
def validate_from_target(modelfile_path):
    """Check that the Modelfile's FROM target exists (relative to the Modelfile).

    The first FROM instruction found decides the outcome. The keyword is
    matched case-insensitively and may be separated from its argument by any
    whitespace, so ``from ./x.gguf`` and ``FROM<TAB>./x.gguf`` are recognised
    as well as ``FROM ./x.gguf``.

    Args:
        modelfile_path: path (str or Path) to the Modelfile.

    Returns:
        None when the FROM target exists.

    Raises:
        SystemExit: if the FROM path is absolute, if the target does not
            exist, or if the file contains no FROM instruction.
    """
    modelfile_path = Path(modelfile_path).resolve()
    # Only the FROM line matters here, so decode leniently: an odd byte in a
    # TEMPLATE/SYSTEM section must not abort the check.
    text = modelfile_path.read_text(encoding="utf-8", errors="replace")
    for raw_line in text.splitlines():
        parts = raw_line.strip().split(None, 1)
        if len(parts) != 2 or parts[0].upper() != "FROM":
            continue
        target = parts[1].strip().strip('"')
        # The original author observed that an absolute path here makes
        # `ollama create` fail with a misleading "no Modelfile or safetensors
        # files found" error, so it is rejected up front.
        if target.startswith("/"):
            sys.exit(
                f"ERROR: Modelfile {modelfile_path} has an absolute FROM path "
                f"({target}). Ollama needs it relative to the Modelfile."
            )
        resolved = (modelfile_path.parent / target).resolve()
        if not resolved.exists():
            sys.exit(
                f"ERROR: Modelfile {modelfile_path} references\n"
                f" FROM {target}\n"
                f"but {resolved} does not exist.\n"
                f"Either rename the GGUF or edit the FROM line in the Modelfile.\n"
                f"(NB: ollama reports this as 'invalid model name' — misleading.)"
            )
        return
    sys.exit(f"ERROR: no FROM line found in {modelfile_path}")
def ollama_generate_raw(url, model, prompt):
    """Send ``prompt`` to ``{url}/api/generate`` with ``raw`` enabled.

    The request is non-streaming and asks for a single predicted token with
    temperature 0 and seed 0. Returns whatever ``_post`` returns for it.
    """
    sampling = {"num_predict": 1, "temperature": 0, "seed": 0}
    body = dict(
        model=model,
        prompt=prompt,
        raw=True,
        stream=False,
        options=sampling,
    )
    endpoint = f"{url}/api/generate"
    return _post(endpoint, body)
" + "Must match ollama's naming rules: lowercase letters, digits, " + "hyphens and periods only (no underscores).") + parser.add_argument("--transformers-output", default=None, + help="Path to the transformers.json (enables the raw tokenizer probe)") + parser.add_argument("--ollama-url", default="http://localhost:11434") + args = parser.parse_args() + + version = check_ollama_alive(args.ollama_url) + print(f"[ollama] Server reachable (version {version})") + + transformers_by_name = {} + if args.transformers_output and Path(args.transformers_output).exists(): + ref = json.loads(Path(args.transformers_output).read_text()) + transformers_by_name = {r["name"]: r for r in ref} + print(f"[ollama] Loaded {len(transformers_by_name)} transformers references " + f"(will probe tokenizer with raw=true)") + else: + print("[ollama] No transformers reference available; skipping raw-tokenizer probe") + + ollama_create(args.model_name, args.modelfile_path) + + results = [] + try: + for case in TEST_CASES: + print(f"[ollama] {case['name']}") + entry = {"name": case["name"]} + try: + chat_resp = ollama_chat( + args.ollama_url, args.model_name, + case["messages"], case.get("tools"), + ) + entry["chat_prompt_eval_count"] = chat_resp.get("prompt_eval_count") + except Exception as e: + traceback.print_exc() + entry["chat_error"] = f"{type(e).__name__}: {e}" + + ref = transformers_by_name.get(case["name"]) + if ref and "rendered_prompt" in ref: + try: + raw_resp = ollama_generate_raw( + args.ollama_url, args.model_name, ref["rendered_prompt"], + ) + entry["raw_prompt_eval_count"] = raw_resp.get("prompt_eval_count") + except Exception as e: + traceback.print_exc() + entry["raw_error"] = f"{type(e).__name__}: {e}" + + results.append(entry) + finally: + ollama_delete(args.model_name) + + out = Path(args.output_json) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(results, indent=2, ensure_ascii=False)) + print(f"[ollama] Wrote {len(results)} results to {out}") + + 
def render_case(tokenizer, case):
    """Render one test case with the chat template and tokenize it.

    Args:
        tokenizer: object exposing
            ``apply_chat_template(messages, tokenize=..., **kwargs)``.
        case: dict with ``name`` and a non-empty ``messages`` list of
            role/content dicts; ``tools`` is optional.

    Returns:
        A dict with ``name``, ``messages``, ``tools``,
        ``add_generation_prompt``, ``rendered_prompt``, ``token_count`` and
        ``token_ids`` (a flat list of ints).

    Raises:
        IndexError: if ``messages`` is empty.
    """
    messages = case["messages"]
    tools = case.get("tools")

    # If the conversation ends on an assistant turn, we are NOT prompting for
    # another generation; otherwise we are (mirrors Ollama's behaviour).
    add_generation_prompt = messages[-1]["role"] != "assistant"

    kwargs = {"add_generation_prompt": add_generation_prompt}
    if tools is not None:
        kwargs["tools"] = tools

    rendered = tokenizer.apply_chat_template(messages, tokenize=False, **kwargs)
    token_ids = tokenizer.apply_chat_template(messages, tokenize=True, **kwargs)

    # Normalize whatever apply_chat_template returned to a flat list[int].
    # NOTE(review): some transformers versions return a dict-like
    # BatchEncoding here instead of a bare id list; without this unwrap,
    # len() would count its keys rather than tokens — confirm against the
    # pinned transformers version.
    if hasattr(token_ids, "keys") and "input_ids" in token_ids:
        token_ids = token_ids["input_ids"]
    # Tensor / array results expose tolist().
    if hasattr(token_ids, "tolist"):
        token_ids = token_ids.tolist()
    # Batched output ([[...]]): keep the single sequence.
    if token_ids and isinstance(token_ids[0], list):
        token_ids = token_ids[0]

    return {
        "name": case["name"],
        "messages": messages,
        "tools": tools,
        "add_generation_prompt": add_generation_prompt,
        "rendered_prompt": rendered,
        "token_count": len(token_ids),
        "token_ids": token_ids,
    }
parser.parse_args() + + print(f"[transformers] Loading tokenizer from {args.hf_model_dir}") + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir, trust_remote_code=True) + + results = [] + for case in TEST_CASES: + print(f"[transformers] {case['name']}") + try: + results.append(render_case(tokenizer, case)) + except Exception as e: + traceback.print_exc() + results.append({"name": case["name"], "error": f"{type(e).__name__}: {e}"}) + + out = Path(args.output_json) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(results, indent=2, ensure_ascii=False)) + print(f"[transformers] Wrote {len(results)} results to {out}") + + +if __name__ == "__main__": + main() diff --git a/test_conversion/test_cases.py b/test_conversion/test_cases.py new file mode 100644 index 00000000000..48b458ebf09 --- /dev/null +++ b/test_conversion/test_cases.py @@ -0,0 +1,253 @@ +""" +Test conversations used to compare the transformers chat template (jinja) +against the Ollama Modelfile template. + +Each test case is a dict with: + name : unique short identifier (used in filenames and reports) + messages : OpenAI-style list of message dicts + tools : list of OpenAI-style tool definitions, or None +""" + +WEATHER_TOOL = { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City name, e.g. 'Paris'.", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature unit.", + }, + }, + "required": ["location"], + }, + }, +} + +CALCULATOR_TOOL = { + "type": "function", + "function": { + "name": "calculator", + "description": "Evaluate a math expression.", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "A math expression, e.g. 
'2 + 2 * 3'.", + }, + }, + "required": ["expression"], + }, + }, +} + + +TEST_CASES = [ + # --- No tools --- + { + "name": "01_user_only", + "messages": [ + {"role": "user", "content": "Bonjour, comment vas-tu ?"}, + ], + "tools": None, + }, + { + "name": "02_system_user", + "messages": [ + {"role": "system", "content": "Tu es un assistant qui répond en français."}, + {"role": "user", "content": "Quelle est la capitale de la France ?"}, + ], + "tools": None, + }, + { + "name": "03_multi_turn", + "messages": [ + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you today?"}, + {"role": "user", "content": "What's 2 + 2?"}, + {"role": "assistant", "content": "2 + 2 equals 4."}, + {"role": "user", "content": "Thanks!"}, + ], + "tools": None, + }, + # --- Tools, no tool call yet --- + { + "name": "04_tools_available_no_call", + "messages": [ + {"role": "user", "content": "What's the weather in Paris?"}, + ], + "tools": [WEATHER_TOOL], + # Behavioural expectation: the model should emit a tool_call rather than text. 
+ "expected_behavior": { + "tool_call": { + "name": "get_weather", + "required_args": ["location"], + "args_must_contain": {"location": "paris"}, # case-insensitive substring + }, + }, + }, + { + "name": "05_tools_with_system", + "messages": [ + {"role": "system", "content": "You are a weather assistant."}, + {"role": "user", "content": "Weather in Paris please."}, + ], + "tools": [WEATHER_TOOL, CALCULATOR_TOOL], + "expected_behavior": { + "tool_call": { + "name": "get_weather", + "required_args": ["location"], + "args_must_contain": {"location": "paris"}, + }, + }, + }, + # --- Tool call + tool response --- + { + "name": "06_single_tool_call_and_response", + "messages": [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "Paris"}', + }, + }, + ], + }, + {"role": "tool", "content": '{"temperature": 18, "unit": "celsius"}'}, + {"role": "assistant", "content": "It's 18°C in Paris."}, + ], + "tools": [WEATHER_TOOL], + }, + # --- Multiple tool calls in one assistant turn --- + { + "name": "07_multiple_tool_calls", + "messages": [ + {"role": "user", "content": "Weather in Paris and London?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "Paris"}', + }, + }, + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "London"}', + }, + }, + ], + }, + ], + "tools": [WEATHER_TOOL], + }, + # --- Consecutive tool responses (must batch into one user turn in jinja) --- + { + "name": "08_consecutive_tool_responses", + "messages": [ + {"role": "user", "content": "Weather in Paris and London?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "Paris"}', + }, 
+ }, + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "London"}', + }, + }, + ], + }, + {"role": "tool", "content": '{"location": "Paris", "temperature": 18}'}, + {"role": "tool", "content": '{"location": "London", "temperature": 15}'}, + ], + "tools": [WEATHER_TOOL], + }, + # --- Assistant with BOTH content AND tool_calls --- + { + "name": "09_assistant_content_and_tool_call", + "messages": [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": "Let me check that for you.", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "Paris"}', + }, + }, + ], + }, + {"role": "tool", "content": '{"temperature": 18, "unit": "celsius"}'}, + ], + "tools": [WEATHER_TOOL], + }, + # --- Long-ish multi-turn with tools sprinkled in --- + { + "name": "10_full_tool_dialogue", + "messages": [ + {"role": "system", "content": "You are a helpful assistant with tools."}, + {"role": "user", "content": "Compute 12*34 then tell me the weather in Lyon."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "calculator", + "arguments": '{"expression": "12*34"}', + }, + }, + ], + }, + {"role": "tool", "content": '{"result": 408}'}, + { + "role": "assistant", + "content": "12*34 = 408. 
Now checking the weather.", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "Lyon"}', + }, + }, + ], + }, + {"role": "tool", "content": '{"temperature": 21, "unit": "celsius"}'}, + {"role": "assistant", "content": "12*34 = 408 and it's 21°C in Lyon."}, + ], + "tools": [WEATHER_TOOL, CALCULATOR_TOOL], + }, +] diff --git a/test_conversion/test_main.py b/test_conversion/test_main.py new file mode 100644 index 00000000000..75505c641b1 --- /dev/null +++ b/test_conversion/test_main.py @@ -0,0 +1,175 @@ +""" +Main orchestrator: compare the chat template + tokenizer of a transformers +model against an Ollama (GGUF + Modelfile) deployment. + +Pipeline: + 1. run_transformers.py -> /transformers.json + 2. run_ollama.py -> /ollama.json + 3. run_behavior.py -> /behavior.json (only cases with expected_behavior) + 4. run_logits.py -> /logits.json (per-case next-token logit comparison) + 5. compare.py -> prints per-test report, exit 1 on failure + +Each step is skipped if its output JSON already exists; delete the file (or +the whole ) to force recomputation. Or pass --force. Slow optional +steps can be turned off with --no-behavior and --no-logits. + +Requirements: + - transformers + torch (Python; transformers always; torch only for --logits) + - requests (Python) + - ollama (must be running: ollama serve) + +Usage: + python test_main.py + [--work-dir DIR] + [--ollama-model-name NAME] + [--ollama-url URL] + [--num-predict N] + [--logits-top-k K] + [--logits-device cuda|cpu] + [--no-behavior] + [--no-logits] + [--force] + +Where: + is a HuggingFace transformers model directory + (must contain tokenizer files + chat_template). + is a directory containing both: + - a 'Modelfile' file + - the .gguf file referenced by the Modelfile (FROM ./...) 
def run_step(label, cmd, output_file, force):
    """Run one pipeline step as a subprocess unless its output is cached.

    When ``force`` is falsy and ``output_file`` already exists, the step is
    reported as skipped and nothing is executed. Otherwise ``cmd`` is echoed
    and run; a non-zero exit status terminates the process via ``sys.exit``
    with a message naming the step and the exit code.
    """
    cached = False if force else output_file.exists()
    if cached:
        print(f"=== {label}: SKIP (using cached {output_file}) ===\n")
        return

    print(f"=== {label} ===")
    shown = " ".join(cmd)
    print("$ " + shown)

    completed = subprocess.run(cmd)
    rc = completed.returncode
    if rc != 0:
        sys.exit(f"!!! {label} failed (exit {rc})")
    print()
" + "No underscores: ollama rejects them.") + parser.add_argument("--ollama-url", default="http://localhost:11434") + parser.add_argument("--num-predict", type=int, default=256, + help="Max tokens generated per behavioural case (default 256)") + parser.add_argument("--logits-top-k", type=int, default=20, + help="K for next-token top-K logit comparison (default 20)") + parser.add_argument("--logits-device", default=None, choices=["cuda", "cpu"], + help="Device for the transformers forward pass (default: cuda if available, else cpu)") + parser.add_argument("--no-behavior", action="store_true", + help="Skip the behavioural step (slower; requires the model to actually generate)") + parser.add_argument("--no-logits", action="store_true", + help="Skip the logit-comparison step (loads the full transformers model; slow)") + parser.add_argument("--force", action="store_true", + help="Recompute all intermediate outputs, ignoring caches") + args = parser.parse_args() + + here = Path(__file__).resolve().parent + hf_dir = Path(args.hf_model_dir).resolve() + gguf_dir = Path(args.gguf_dir).resolve() + modelfile = gguf_dir / "Modelfile" + + if not hf_dir.is_dir(): + sys.exit(f"ERROR: HF model dir not found: {hf_dir}") + if not modelfile.is_file(): + sys.exit(f"ERROR: Modelfile not found at {modelfile}") + + work_dir = (Path(args.work_dir).resolve() + if args.work_dir + else here / "results" / f"{hf_dir.name}__vs__{gguf_dir.name}") + work_dir.mkdir(parents=True, exist_ok=True) + transformers_json = work_dir / "transformers.json" + ollama_json = work_dir / "ollama.json" + behavior_json = work_dir / "behavior.json" + logits_json = work_dir / "logits.json" + + print(f"HF model dir : {hf_dir}") + print(f"GGUF dir : {gguf_dir}") + print(f"Modelfile : {modelfile}") + print(f"Work dir : {work_dir}") + print(f"Ollama URL : {args.ollama_url}") + print() + + # Step 1: transformers + run_step( + "Step 1/5 — transformers (render + tokenize)", + [sys.executable, str(here / 
"run_transformers.py"), str(hf_dir), str(transformers_json)], + transformers_json, + args.force, + ) + + # Step 2: ollama (depends on Step 1's JSON for the raw-tokenizer probe) + run_step( + "Step 2/5 — ollama (chat + raw tokenizer probes)", + [sys.executable, str(here / "run_ollama.py"), str(modelfile), str(ollama_json), + "--model-name", args.ollama_model_name, + "--transformers-output", str(transformers_json), + "--ollama-url", args.ollama_url], + ollama_json, + args.force, + ) + + # Step 3: behavioural check (optional). The model actually generates here. + if args.no_behavior: + print("=== Step 3/5 — behavioural check: SKIPPED (--no-behavior) ===\n") + else: + run_step( + "Step 3/5 — behavioural check (model generates tool_calls)", + [sys.executable, str(here / "run_behavior.py"), str(modelfile), str(behavior_json), + "--transformers-output", str(transformers_json), + "--model-name", args.ollama_model_name, + "--ollama-url", args.ollama_url, + "--num-predict", str(args.num_predict)], + behavior_json, + args.force, + ) + + # Step 4: logit comparison (optional, slow — loads the full transformers model). 
+ if args.no_logits: + print("=== Step 4/5 — logit comparison: SKIPPED (--no-logits) ===\n") + else: + logits_cmd = [sys.executable, str(here / "run_logits.py"), + str(hf_dir), str(modelfile), str(logits_json), + "--transformers-output", str(transformers_json), + "--model-name", args.ollama_model_name, + "--ollama-url", args.ollama_url, + "--top-k", str(args.logits_top_k)] + if args.logits_device: + logits_cmd += ["--device", args.logits_device] + run_step( + "Step 4/5 — logit comparison (transformers vs Ollama, next-token top-K)", + logits_cmd, + logits_json, + args.force, + ) + + # Step 5: compare (always runs) + print("=== Step 5/5 — compare ===") + compare_cmd = [sys.executable, str(here / "compare.py"), str(transformers_json), str(ollama_json)] + if not args.no_behavior and behavior_json.exists(): + compare_cmd += ["--behavior", str(behavior_json)] + if not args.no_logits and logits_json.exists(): + compare_cmd += ["--logits", str(logits_json)] + rc = subprocess.run(compare_cmd).returncode + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 93e697607e6..374ae159c2e 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -30,7 +30,7 @@ class LibLlama: DEFAULT_PATH_LLAMA_H = "./include/llama.h" DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] - DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON + DEFAULT_PATH_LIBLLAMA = "./build/bin/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H @@ -79,6 +79,9 @@ def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}): self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams) if not self.model: raise RuntimeError("error: failed to load model '%s'" % 
path_model) + self.vocab = self.lib.llama_model_get_vocab(self.model) + if not self.vocab: + raise RuntimeError("error: failed to get vocab for model '%s'" % path_model) if isinstance(cparams, dict): cparams = libllama.context_default_params(**cparams) self.ctx = self.lib.llama_new_context_with_model(self.model, cparams) @@ -99,10 +102,10 @@ def free(self): def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]: encoded_text: bytes = text.encode("utf-8") - num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) + num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) while num < 0 and len(self.token_ids) < (16 << 20): self.token_ids = self.ffi.new("llama_token[]", -2 * num) - num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) + num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) return list(self.token_ids[0:num]) def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str: @@ -110,10 +113,10 @@ def detokenize(self, ids: list[int], remove_special: bool = False, unparse_speci self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids)) for i, id in enumerate(ids): self.token_ids[i] = id - num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) + num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) while num < 0 and len(self.text_buff) < (16 << 20): self.text_buff = self.ffi.new("uint8_t[]", -2 * num) - num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), 
self.text_buff, len(self.text_buff), remove_special, unparse_special) + num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' @@ -152,6 +155,9 @@ def encode(self, text: str) -> list[int]: def decode(self, ids: list[int]) -> str: return self.model.decode(ids, skip_special_tokens=False) + + def convert_ids_to_tokens(self, ids: list[int]) -> list[str]: + return self.model.convert_ids_to_tokens(ids) class TokenizerLlamaCpp (Tokenizer): @@ -204,6 +210,12 @@ def generator_custom_text() -> Iterator[str]: "\n =", "' era", "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + ] + + +def generator_digit() -> Iterator[str]: + """Digits""" + yield from [ "3", "33", "333", @@ -213,6 +225,20 @@ def generator_custom_text() -> Iterator[str]: "3333333", "33333333", "333333333", + "333333333+333", + ] + + +def generator_contractions() -> Iterator[str]: + """Contractions and apostrophes""" + yield from [ + "I'll", + "We've they're", + "Bonjour quoiqu'aujourd'hui", + "puisqu'après", + "j're", + "“Bonjour quoiqu'aujourd'hui”", + "puisqu’après", ] @@ -418,7 +444,7 @@ def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): return min(len(ids1), len(ids2)) def check_detokenizer(text: str, text1: str, text2: str) -> bool: - if text1 == text2: # equal to TokenizerGroundtruth? + if text1 == text2 or text2 == text: # equal to TokenizerGroundtruth? return True # equal to source text? 
if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str): # remove BOS @@ -436,7 +462,7 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_start = time.perf_counter() encode_errors = 0 decode_errors = 0 - MAX_ERRORS = 10 + MAX_ERRORS = 20 logger.info("%s: %s" % (generator.__qualname__, "ini")) for text in generator: @@ -455,23 +481,30 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_encode2 += t2 - t1 t_decode1 += t3 - t2 t_decode2 += t4 - t3 - if encode_errors < MAX_ERRORS and ids1 != ids2: + had_error = False + if (MAX_ERRORS is None or encode_errors < MAX_ERRORS) and ids1 != ids2: i = find_first_mismatch(ids1, ids2) - ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] - ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] - logger.error(" Expected: " + str(ids1)) - logger.error(" Result: " + str(ids2)) + ids1_ctx = list(ids1)[max(0, i - 2) : i + 5 + 1] + ids2_ctx = list(ids2)[max(0, i - 2) : i + 5 + 1] + logger.error(f" Input: {repr(text[:100])}") + logger.error(" Expected: " + str(ids1_ctx) + " " + str(tokenizer1.convert_ids_to_tokens(ids1_ctx))) + logger.error(" Result: " + str(ids2_ctx) + " " + str(tokenizer1.convert_ids_to_tokens(ids2_ctx))) encode_errors += 1 - logger.error(f" {encode_errors=}") - if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2): + # logger.error(f" {encode_errors=}") + had_error = True + if (MAX_ERRORS is None or decode_errors < MAX_ERRORS) and not check_detokenizer(text, text1, text2): i = find_first_mismatch(text1, text2) - text1 = list(text1[max(0, i - 2) : i + 5 + 1]) - text2 = list(text2[max(0, i - 2) : i + 5 + 1]) - logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) - logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) + text1_ctx = text1[max(0, i - 2) : i + 5 + 1] + text2_ctx = text2[max(0, i - 2) : i + 5 + 1] + logger.error(f" Input: {repr(text[:100])}") + logger.error(" Expected: " + repr(text1_ctx)) + 
logger.error(" Result: " + repr(text2_ctx)) decode_errors += 1 - logger.error(f" {decode_errors=}") - if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: + # logger.error(f" {decode_errors=}") + had_error = True + if had_error: + logger.error("") + if MAX_ERRORS is not None and encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: logger.error(f" EXIT: {encode_errors=} {decode_errors=}") # raise Exception() break @@ -493,74 +526,76 @@ def main(argv: list[str] | None = None): tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer) tokenizer2 = TokenizerLlamaCpp(args.vocab_file) - # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text()) - # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases()) + compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text()) + compare_tokenizers(tokenizer1, tokenizer2, generator_digit()) + compare_tokenizers(tokenizer1, tokenizer2, generator_contractions()) + compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases()) compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip()) compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe()) compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes()) compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1)) compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, 
generator_random_chars(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) tokenizer2.model.free() if __name__ == "__main__": - # main() - - if True: - logging.basicConfig( - level = logging.DEBUG, - format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s", - datefmt = "%Y-%m-%d %H:%M:%S", - filename = logger.name + ".log", - filemode = "a" - ) - logging.basicConfig( - level = logging.DEBUG, - format = "%(levelname)s %(message)s", - ) - - path_tokenizers = Path("./models/tokenizers/") - path_vocab_format = "./models/ggml-vocab-%s.gguf" - - tokenizers = [ - "llama-spm", # SPM - "phi-3", # SPM - "gemma", # SPM - "gemma-2", # SPM - "baichuan", # SPM - "bert-bge", # WPM - "jina-v2-en", # WPM - "llama-bpe", # BPE - "phi-2", # BPE - "deepseek-llm", # BPE - "deepseek-coder", # BPE - "falcon", # BPE - "mpt", # BPE - "starcoder", # BPE - "gpt-2", # BPE - "stablelm2", # BPE - "refact", # BPE - "qwen2", # BPE - "olmo", # BPE - "jina-v2-es", # BPE - "jina-v2-de", # BPE - "smaug-bpe", # BPE - "poro-chat", # BPE - "jina-v2-code", # BPE - "viking", # BPE - "jais", # BPE - ] - - logger.info("=" * 50) - for tokenizer in tokenizers: - logger.info("-" * 50) - logger.info(f"TOKENIZER: '{tokenizer}'") - vocab_file = Path(path_vocab_format % tokenizer) - dir_tokenizer = path_tokenizers / tokenizer - main([str(vocab_file), str(dir_tokenizer), "--verbose"]) + main() + + # if True: + # logging.basicConfig( + # level = logging.DEBUG, + # format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s", + # datefmt = "%Y-%m-%d %H:%M:%S", + # filename = logger.name + ".log", + # filemode = "a" + # ) + # logging.basicConfig( + # level = logging.DEBUG, + # format = "%(levelname)s %(message)s", + # ) + + # path_tokenizers = Path("./models/tokenizers/") + # 
path_vocab_format = "./models/ggml-vocab-%s.gguf" + + # tokenizers = [ + # "llama-spm", # SPM + # "phi-3", # SPM + # "gemma", # SPM + # "gemma-2", # SPM + # "baichuan", # SPM + # "bert-bge", # WPM + # "jina-v2-en", # WPM + # "llama-bpe", # BPE + # "phi-2", # BPE + # "deepseek-llm", # BPE + # "deepseek-coder", # BPE + # "falcon", # BPE + # "mpt", # BPE + # "starcoder", # BPE + # "gpt-2", # BPE + # "stablelm2", # BPE + # "refact", # BPE + # "qwen2", # BPE + # "olmo", # BPE + # "jina-v2-es", # BPE + # "jina-v2-de", # BPE + # "smaug-bpe", # BPE + # "poro-chat", # BPE + # "jina-v2-code", # BPE + # "viking", # BPE + # "jais", # BPE + # ] + + # logger.info("=" * 50) + # for tokenizer in tokenizers: + # logger.info("-" * 50) + # logger.info(f"TOKENIZER: '{tokenizer}'") + # vocab_file = Path(path_vocab_format % tokenizer) + # dir_tokenizer = path_tokenizers / tokenizer + # main([str(vocab_file), str(dir_tokenizer), "--verbose"])