diff --git a/common/debug.cpp b/common/debug.cpp
index 0df409a79db..4732f73c443 100644
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -3,6 +3,10 @@
 #include "log.h"
 
 #include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <regex>
 #include <string>
 
 static std::string common_ggml_ne_string(const ggml_tensor * t) {
@@ -155,6 +159,53 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
     if (!ggml_is_quantized(t->type) && matches_filter) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
         common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
+
+        // Optional full-tensor binary dump for layer-by-layer comparison work.
+        // Activated by setting env var LLAMA_DUMP_TENSORS_FILE=/path/to/out.bin.
+        // Optionally narrow what gets dumped with LLAMA_DUMP_TENSORS_REGEX
+        // (a single regex; matched unanchored via std::regex_search). If unset,
+        // empty or invalid, every tensor that passed cb_data's filter is dumped.
+        // Per-tensor binary record (host byte order; little-endian on x86/ARM):
+        //     u32 name_len, char name[name_len],
+        //     u32 dtype (ggml_type), i64 ne[4],
+        //     u64 n_bytes, u8 data[n_bytes]
+        const char * dump_path = std::getenv("LLAMA_DUMP_TENSORS_FILE");
+        if (dump_path && *dump_path) { // an empty value is treated like an unset variable
+            static std::regex dump_regex;
+            static bool       dump_regex_set   = false; // true once the env var has been read
+            static bool       dump_regex_valid = false; // true only if a pattern compiled successfully
+            if (!dump_regex_set) {
+                dump_regex_set = true;
+                const char * pat = std::getenv("LLAMA_DUMP_TENSORS_REGEX");
+                if (pat && *pat) {
+                    try { dump_regex = std::regex(pat); dump_regex_valid = true; }
+                    catch (const std::regex_error & e) { LOG_ERR("%s: invalid LLAMA_DUMP_TENSORS_REGEX '%s' (%s), dumping all tensors\n", __func__, pat, e.what()); }
+                }
+            }
+            if (!dump_regex_valid || std::regex_search(t->name, dump_regex)) { // no usable regex => dump all
+                static FILE *      dump_fout = nullptr;
+                static std::string opened_path;
+                if (opened_path != dump_path) { // open once per path: a failed open is logged once, not retried per tensor
+                    if (dump_fout) std::fclose(dump_fout);
+                    dump_fout   = std::fopen(dump_path, "wb");
+                    opened_path = dump_path;
+                    if (!dump_fout) LOG_ERR("%s: cannot open LLAMA_DUMP_TENSORS_FILE '%s' for writing\n", __func__, dump_path);
+                }
+                if (dump_fout) {
+                    const uint32_t name_len = (uint32_t) std::strlen(t->name);
+                    const uint32_t dtype    = (uint32_t) t->type;
+                    const int64_t  ne[4]    = { t->ne[0], t->ne[1], t->ne[2], t->ne[3] };
+                    const uint64_t nbytes   = (uint64_t) ggml_nbytes(t);
+                    std::fwrite(&name_len, sizeof(name_len), 1, dump_fout);
+                    std::fwrite(t->name, 1, name_len, dump_fout);
+                    std::fwrite(&dtype, sizeof(dtype), 1, dump_fout);
+                    std::fwrite(ne, sizeof(ne[0]), 4, dump_fout);
+                    std::fwrite(&nbytes, sizeof(nbytes), 1, dump_fout);
+                    std::fwrite(data, 1, nbytes, dump_fout);
+                    std::fflush(dump_fout); // keep the file usable even if the run aborts later
+                }
+            }
+        }
     }
 
     return true;
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 46469c86200..dbb11961d62 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1490,6 +1490,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
             # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
             res = "kanana2"
+        if chkhsh == "5f9861fd826d8e124b222f41f41b928e78d8f6c8fbdf25625d06cc1e8736662c":
+            # ref: https://huggingface.co/OpenLLM-France/Luciole-1B-Base
+            res = "qwen2"
 
         if res is None:
             logger.warning("\n")
@@ -1515,15 +1518,179 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     def _set_vocab_none(self) -> None:
         self.gguf_writer.add_tokenizer_model("none")
 
-    def _set_vocab_gpt2(self) -> None:
+    @staticmethod
+    def _gpt2_bytes_to_unicode() -> dict[int, str]:
+        # Build the GPT-2 byte -> printable unicode character table for bytes 0-255.
+        # Bytes in '!'..'~', 0xA1..0xAC and 0xAE..0xFF map to the character with the
+        # same code point; every other byte, taken in ascending order, is mapped to
+        # code points 256, 257, ... so that no byte ends up on an unprintable char.
+        # Same table as bytes_to_unicode() in openai/gpt-2.
+        printable = [*range(ord("!"), ord("~") + 1), *range(0xA1, 0xAC + 1), *range(0xAE, 0xFF + 1)]
+        table = {byte: chr(byte) for byte in printable}
+        shift = 0
+        for byte in range(256):
+            if byte not in table:
+                table[byte] = chr(256 + shift)
+                shift += 1
+        return table
+
+    def _set_vocab_gpt2(self, convert_metaspace_to_gpt2: bool = False) -> list[str]:
+        """Write a "gpt2" (BPE) vocab to the GGUF writer and return the token list written.
+
+        If convert_metaspace_to_gpt2 is set, tokens and merges are re-encoded from raw
+        UTF-8 + Metaspace (▁) to the GPT-2 byte alphabet that the "gpt2" tokenizer expects.
+        """
         tokens, toktypes, tokpre = self.get_vocab_base()
+
+        if convert_metaspace_to_gpt2:
+            # Replace ▁ by a space, then map each UTF-8 byte to its GPT-2 character
+            # (e.g. space -> Ġ). Two tokens must never end up with the same encoding.
+            byte_encoder = self._gpt2_bytes_to_unicode()
+            seen: set[str] = set()
+            for i, token in enumerate(tokens):
+                if toktypes[i] in (gguf.TokenType.NORMAL, gguf.TokenType.USER_DEFINED):
+                    # " " would collide with ▁ (both -> Ġ): encode it as a literal ▁ (useless token in Luciole)
+                    source = "\u2581" if token == " " else token.replace("\u2581", " ")
+                    encoded = "".join(byte_encoder[b] for b in source.encode("utf-8"))
+                    assert encoded not in seen, f"Unexpected collision in GPT-2 byte encoding: {encoded!r} for '{token}'"
+                    seen.add(encoded)
+                    tokens[i] = encoded
+                else:  # every other token type (e.g. gguf.TokenType.CONTROL) is kept verbatim
+                    assert token not in seen, f"Unexpected collision in GPT-2 byte encoding: {token}"
+                    seen.add(token)
+
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        if convert_metaspace_to_gpt2:
+            special_vocab.merges = [
+                " ".join(
+                    "".join(byte_encoder[b] for b in part.replace("\u2581", " ").encode("utf-8"))
+                    for part in merge.split(" ")
+                )
+                for merge in special_vocab.merges
+            ]
         special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens
+
+    def _set_vocab_bpe_as_spm(self) -> list[bytes]:
+        """Convert a HuggingFace BPE tokenizer (with Metaspace ▁) to SPM format for llama.cpp.
+
+        Loads the vocab through AutoTokenizer and the merges from tokenizer.json,
+        keeps tokens in their original UTF-8 form (with ▁ preserved), scores them by
+        merge rank, and adds <0xHH> byte fallback tokens for still-uncovered bytes.
+
+        Returns the list of token byte strings written to the GGUF writer.
+        """
+        import json as _json
+        import re as _re
+
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab = tokenizer.vocab
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+
+        reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+        byte_token_re = _re.compile(r"<0x([0-9A-Fa-f]{2})>")
+
+        # Build merge rank lookup: token_text -> rank (lower rank = merged earlier = higher priority)
+        merge_ranks: dict[str, int] = {}
+        merges_file = self.dir_model / "tokenizer.json"
+        if merges_file.is_file():
+            with open(merges_file, "r", encoding="utf-8") as f:
+                tokenizer_json = _json.load(f)
+            merges = tokenizer_json.get("model", {}).get("merges", [])
+            for rank, merge in enumerate(merges):
+                # merge can be "token_a token_b" (str) or ["token_a", "token_b"] (list)
+                parts = merge.split(" ") if isinstance(merge, str) else merge
+                merged_token = "".join(parts)
+                if merged_token not in merge_ranks:
+                    merge_ranks[merged_token] = rank
+
+        # Prepare token arrays; slots that are never filled stay [PADn] / UNUSED
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        # Track which byte values are covered (for byte fallback): byte value -> token id
+        byte_token_ids: dict[int, int] = {}
+
+        for token_id in range(vocab_size):
+            if token_id not in reverse_vocab:
+                continue
+
+            token_text = reverse_vocab[token_id]
+
+            if token_id in added_tokens_decoder:
+                info = added_tokens_decoder[token_id]
+                if info.special or self.does_token_look_special(token_text):
+                    tokens[token_id] = token_text.encode("utf-8")
+                    scores[token_id] = 0.0
+                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    continue
+
+            # Check if this is a byte fallback token (<0xHH>) or a single-byte token
+            raw_bytes = token_text.encode("utf-8")
+            byte_match = byte_token_re.fullmatch(token_text)
+            if byte_match:
+                byte_val = int(byte_match.group(1), 16)
+                byte_token_ids[byte_val] = token_id
+                tokens[token_id] = raw_bytes
+                scores[token_id] = -10000.0
+                toktypes[token_id] = SentencePieceTokenTypes.BYTE
+                continue
+            elif len(raw_bytes) == 1:
+                byte_token_ids[raw_bytes[0]] = token_id
+
+            # Assign score based on merge rank or token_id
+            if token_text in merge_ranks:
+                # Merged tokens: earlier merges get higher (less negative) scores
+                # Use negative rank so that rank 0 (first merge) gets highest score
+                score = -float(merge_ranks[token_text])
+            else:
+                # Base tokens (single chars) get high scores; unknown tokens get low scores
+                if len(raw_bytes) == 1:
+                    score = 0.0
+                else:
+                    score = -10000.0 + float(token_id)
+
+            tokens[token_id] = raw_bytes
+            scores[token_id] = score
+            toktypes[token_id] = SentencePieceTokenTypes.NORMAL
+
+        # Add byte fallback tokens for the byte values that are still uncovered.
+        # Bytes that already have a <0xHH> token or a single-byte token (recorded in
+        # byte_token_ids above) are left untouched; every other byte gets a <0xHH>
+        # BYTE token written into the first slot that is still UNUSED.
+        next_pad_idx = 0
+        for byte_val in range(256):
+            if byte_val in byte_token_ids:
+                continue  # already handled above
+            hex_str = f"<0x{byte_val:02X}>"
+            # Find an unused PAD slot
+            while next_pad_idx < len(tokens) and toktypes[next_pad_idx] != SentencePieceTokenTypes.UNUSED:
+                next_pad_idx += 1
+            if next_pad_idx < vocab_size:
+                tokens[next_pad_idx] = hex_str.encode("utf-8")
+                toktypes[next_pad_idx] = SentencePieceTokenTypes.BYTE
+                scores[next_pad_idx] = -10000.0
+                next_pad_idx += 1
+            else:
+                logger.warning(f"No room to add byte fallback token {hex_str}")
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens
 
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
@@ -9607,14 +9774,50 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+LUCIOLE_TO_BPE = False  # False: write an SPM ("llama") vocab; True: write a GPT-2 BPE vocab
+def set_vocab_luciole(self):
+    """Write the Luciole vocab, treating every added token as a control token.
+    Every entry of added_tokens_decoder is promoted, even those flagged
+    "special": false in tokenizer_config.json (e.g. <tool_call>, </tool_call>,
+    <tool_response>, </tool_response>). Otherwise llama.cpp's tokenizer
+    BPE-splits them at inference, diverging from training."""
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+    added_token_texts = {info.content for info in tokenizer.added_tokens_decoder.values()}
+    original_does_token_look_special = self.does_token_look_special
+
+    def does_token_look_special_with_added(token):
+        token_text = token.decode("utf-8") if isinstance(token, (bytes, bytearray)) else token
+        if token_text in added_token_texts:
+            return True
+        return original_does_token_look_special(token)
+
+    self.does_token_look_special = does_token_look_special_with_added  # temporary override, undone in finally
+    try:
+        if LUCIOLE_TO_BPE:
+            tokens = self._set_vocab_gpt2(convert_metaspace_to_gpt2=True)
+            self.gguf_writer.add_pad_token_id(tokens.index("<pad>"))  # ValueError if the token is absent
+            self.gguf_writer.add_unk_token_id(tokens.index("<unk>"))
+        else:
+            tokens = self._set_vocab_bpe_as_spm()
+            self.gguf_writer.add_pad_token_id(tokens.index(b"<pad>"))  # tokens are bytes on this path
+            self.gguf_writer.add_unk_token_id(tokens.index(b"<unk>"))
+    finally:
+        self.does_token_look_special = original_does_token_look_special
+    self.gguf_writer.add_add_space_prefix(True)
+
+
 @ModelBase.register("NemotronForCausalLM")
 class NemotronModel(TextModel):
     model_arch = gguf.MODEL_ARCH.NEMOTRON
 
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
-        self.gguf_writer.add_pad_token_id(0)
-        self.gguf_writer.add_unk_token_id(1)
+        if (self.dir_model / "tokenizer.model").is_file():  # SentencePiece model file is present
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_pad_token_id(0)
+            self.gguf_writer.add_unk_token_id(1)
+        else:  # no tokenizer.model: build the vocab with set_vocab_luciole instead
+            set_vocab_luciole(self)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -9642,8 +9845,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         #   model.layers.{l}.input_layernorm.weight
         #   model.layers.{l}.post_attention_layernorm.weight
         #   model.norm.weight
+        # NOTE: cast to fp32 BEFORE the +1 — source weights are bf16/fp16 and the
+        # add would otherwise happen at the source dtype, quantizing γ by ~3.9e-3
+        # (bf16) / ~9.8e-4 (fp16) per element. GGUF stores these tensors as F32,
+        # so doing the arithmetic at full precision is free.
         if name.endswith("norm.weight"):
-            data_torch = data_torch + 1
+            data_torch = data_torch.float() + 1
+
+        # for tied embeddings, duplicate token_embd as output.weight
+        if self.hparams.get("tie_word_embeddings", False) and name == "model.embed_tokens.weight":
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)
+
+        # skip lm_head.weight if tie_word_embeddings is True (already emitted from embed_tokens above)
+        if self.hparams.get("tie_word_embeddings", False) and name == "lm_head.weight":
+            return
 
         yield from super().modify_tensors(data_torch, name, bid)
 
@@ -10091,6 +10306,8 @@ def __init__(self, *args, **kwargs):
             self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
             self.is_moe = True
 
+        self.is_luciole = hparams.get("bos_token_id", -1) == 0
+
         super().__init__(*args, **kwargs)
 
         # Save the top-level head_dim for later
@@ -10164,6 +10381,10 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_moe_latent_size(latent_size)
 
     def set_vocab(self):
+        if self.is_luciole:
+            set_vocab_luciole(self)
+            return
+
         super().set_vocab()
 
         # The tokenizer _does_ add a BOS token (via post_processor type
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 17d162d95d3..3f5e382b120 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -15,7 +15,14 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     const bool add_bos = llama_vocab_get_add_bos(vocab);
 
-    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
+    // Opt-in atomic tokenization of control strings: set
+    // LLAMA_TOKENIZE_PARSE_SPECIAL=1 to make chat-template tokens like
+    // <|im_start|> / <|im_end|> / <tool_call> tokenize as a single id instead
+    // of being byte-split. Default behaviour (env var unset) is unchanged.
+    const char * env_parse_special = std::getenv("LLAMA_TOKENIZE_PARSE_SPECIAL");
+    const bool parse_special = env_parse_special != nullptr &&
+                               env_parse_special[0] != '\0' && env_parse_special[0] != '0';
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, parse_special);
 
     if (tokens.empty()) {
         LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
diff --git a/test_conversion/README.md b/test_conversion/README.md
new file mode 100644
index 00000000000..98ebf416ed6
--- /dev/null
+++ b/test_conversion/README.md
@@ -0,0 +1,283 @@
+# test_conversion
+
+Validate that an HF transformers model has been converted to GGUF
+faithfully — both the tokenizer/chat-template AND the model weights — by
+comparing the **transformers reference** against the **GGUF served by
+Ollama**. Used to catch tokenizer drift, broken chat templates,
+quantization regressions, conversion bugs in `convert_hf_to_gguf.py`,
+etc., before publishing a release.
+
+The suite has two parts:
+
+1. **`test_main.py`** — main 5-step pipeline (transformers reference,
+   Ollama token counts, behaviour, logits, report). Run this for every release.
+2. **`run_layer_diff.py` + `compare_layers.py`** — deeper layer-by-layer
+   activation comparison. Run when (1) flags a logit-level regression
+   and you need to localize which op causes it.
+
+## Prerequisites
+
+### Python
+
+```bash
+pip install transformers torch requests numpy matplotlib safetensors
+```
+
+### llama.cpp built (with our patches)
+
+The layer-diff tool uses two env-gated patches in `llama.cpp`:
+
+- `common/debug.cpp` — binary tensor dump triggered by
+  `LLAMA_DUMP_TENSORS_FILE` and `LLAMA_DUMP_TENSORS_REGEX`. Pure no-op
+  when those vars are unset.
+- `examples/eval-callback/eval-callback.cpp` — atomic tokenization of
+  control tokens (`<|im_start|>` etc.) when `LLAMA_TOKENIZE_PARSE_SPECIAL=1`.
+  Default behaviour unchanged.
+
+Build:
+
+```bash
+cd /path/to/llama.cpp
+cmake -B build
+cmake --build build --target llama-eval-callback -j$(nproc)
+```
+
+### Ollama server running
+
+```bash
+ollama serve     # in another terminal
+```
+
+The main pipeline talks to it on `http://localhost:11434`.
+
+---
+
+## Step 0 — Convert HF → GGUF
+
+Starting from a HuggingFace transformers checkpoint directory:
+
+```bash
+HF_MODEL=/path/to/Luciole-1B-SFT-1.2          # contains config.json, model.safetensors, tokenizer.*
+GGUF_DIR=/path/to/Luciole-1B-SFT-1.2-gguf      # output directory
+
+mkdir -p "$GGUF_DIR"
+python /path/to/llama.cpp/convert_hf_to_gguf.py \
+    "$HF_MODEL" \
+    --outfile "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \
+    --outtype f16
+```
+
+For a quantized variant (smaller, faster, with some precision loss):
+
+```bash
+/path/to/llama.cpp/build/bin/llama-quantize \
+    "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \
+    "$GGUF_DIR/Luciole-1B-SFT-q4_k_m.gguf" \
+    Q4_K_M
+```
+
+## Step 1 — Write the Ollama `Modelfile`
+
+In `$GGUF_DIR/Modelfile`:
+
+```
+FROM ./Luciole-1B-SFT-f16.gguf        # or your quantized variant
+PARAMETER seed 1234
+PARAMETER num_ctx 32000
+PARAMETER temperature 0.6
+SYSTEM "You are a helpful AI assistant named Luciole, trained by LINAGORA and OpenLLM France."
+TEMPLATE """
+…your Go-template version of the jinja chat template, including {{- range .Tools }}{{ . }}{{- end }} for tool support…
+"""
+PARAMETER stop "<|im_end|>"
+PARAMETER stop "<|im_start|>"
+…
+```
+
+Two Ollama 0.24 pitfalls that are easy to miss:
+
+- `FROM` must be **relative** to the Modelfile directory. Absolute paths
+  fail with `no Modelfile or safetensors files found`.
+- Ollama detects tool-calling capability from the template body. For the
+  `nemotron` architecture only the literal `{{ . }}` form inside
+  `{{ range .Tools }}` is recognized — `{{ .Function }}` or
+  `{{ json . }}` will silently disable tool support (Ollama returns
+  `does not support tools` on any tool request).
+
+---
+
+## Step 2 — Run the main pipeline
+
+```bash
+cd /path/to/llama.cpp/test_conversion
+python test_main.py "$HF_MODEL" "$GGUF_DIR"
+```
+
+This runs five steps; each writes a JSON into
+`results/<hf_basename>__vs__<gguf_basename>/` and is skipped on rerun if
+its JSON already exists. Pass `--force` to recompute, or delete the
+JSON manually for a partial rerun. Slow steps can be turned off with
+`--no-behavior` and `--no-logits`.
+
+### What each step checks
+
+| step | script | output | what it tests |
+|------|--------|--------|---------------|
+| 1 | `run_transformers.py` | `transformers.json` | renders each test case with `tokenizer.apply_chat_template(...)` and tokenizes — the reference for everything else. |
+| 2 | `run_ollama.py` | `ollama.json` | per case, asks Ollama for `prompt_eval_count` two ways: (a) `/api/chat` (Ollama applies its Modelfile template + GGUF tokenizer); (b) `/api/generate raw=true` fed the transformers-rendered prompt (GGUF tokenizer only). |
+| 3 | `run_behavior.py` | `behavior.json` | for cases with `expected_behavior`, sends the prompt to the model via `/api/generate raw=true`, parses the generated text for `<tool_call>{…}</tool_call>`, verifies tool name + required args. Bypasses Ollama's `/api/chat` tool parser, which is unreliable on `nemotron`-arch models in 0.24. |
+| 4 | `run_logits.py` | `logits.json` | for the same prompts, runs transformers forward pass and Ollama with `logprobs=true`, compares next-token top-K distributions. Catches quantization / conversion regressions invisible to a binary tool-call test. |
+| 5 | `compare.py` | stdout + exit code | unified report. |
+
+### Reading the report
+
+`compare.py` prints three sections.
+
+**Token-count comparison** — per case, two columns:
+- `tokenizer` = transformers `apply_chat_template(tokenize=True)` length vs
+  Ollama's `prompt_eval_count` on the same rendered prompt via raw mode.
+- `chat template` = same but Ollama applies its own template.
+
+Some mismatches are flagged `[WARN]` (with note) instead of `[FAIL]`:
+- **tokenizer +1 in tool cases**: known llama.cpp SentencePiece quirk
+  — a single space following a special/added token gets segmented as a
+  spurious `▁▁` (two-space) piece. See the *Known issues* section
+  below.
+- **chat template −N in tool cases**: Ollama renders each tool via
+  Go's `json.Marshal` (compact JSON, no spaces); jinja's `tojson` uses
+  pretty JSON (with spaces). Same data, only whitespace differs.
+
+**Behavioural check** — for each `expected_behavior` case, did the
+model emit a valid `<tool_call>` block with the right name + args?
+
+**Logit comparison** — for each case (skipping ones without a generation
+prompt), how close are the next-token distributions?
+
+- `top1` ✓/✗: same most-likely next token (matched by vocab id)
+- `|Δlp_top1|`: absolute logprob diff on the chosen token
+  (fp16-vs-fp16: typically < 0.1; Q4_K_M: < 0.5 normal)
+- `mean|Δlp|_top3`: mean of `|Δlp|` over TF's top-3 tokens
+- `top5_overlap` / `miss`: how many of TF's top-5 are even in Ollama's top-K
+
+The aggregate thresholds for FAIL/WARN are documented at the top of
+`compare.py`.
+
+### Exit code
+
+- `0` — PASS, or PASS with known acceptable warnings.
+- `1` — at least one `[FAIL]` somewhere.
+
+---
+
+## Step 3 — Layer-by-layer diagnostic (optional)
+
+When the logit step flags an unexpected regression on a specific case,
+this localizes which layer (and which op type within the layer) is
+introducing the divergence.
+
+```bash
+python run_layer_diff.py "$HF_MODEL" "$GGUF_DIR/Luciole-1B-SFT-f16.gguf" \
+    --transformers-output results/<hf>__vs__<gguf>/transformers.json \
+    --case 02_system_user \
+    --work-dir results/<hf>__vs__<gguf>/layer_diff_02_system_user
+
+python compare_layers.py results/<hf>__vs__<gguf>/layer_diff_02_system_user --top-tail-only
+```
+
+Outputs:
+- `tf_layers.npz` — transformers per-layer hidden states + intermediate
+  hook outputs (input_layernorm, self_attn, post_attention_layernorm,
+  mlp, final_norm, logits).
+- `gguf_layers.bin` — llama.cpp per-layer activations
+  (`attn_norm-i`, `ffn_inp-i`, `ffn_norm-i`, `l_out-i`, `result_norm`,
+  `result_output`).
+- `layer_diff_report.txt` — per-pair max/mean abs diff, relative max,
+  cosine distance, l2 relative error.
+- `layer_diff_overview.png` — log-scale divergence vs layer index,
+  one series per op type.
+- `layer_diff_l_out.png` — focused view of per-layer block output drift.
+
+`--top-tail-only` restricts comparison to the **last token position** —
+this is what matters for next-token prediction and avoids confusion at
+the last layer where llama.cpp uses `inp_out_ids` to compute only the
+last position.
+
+### How to read the layer-diff
+
+If `L00 attn_norm` matches to floating-point precision but `L00 ffn_inp`
+diverges, the **attention block** is to blame. If `L00 attn_norm` already
+diverges, the **input embedding or tokenization** is to blame (or the
+input LN itself). And so on along the column of op types.
+
+---
+
+## Known issues / current findings (Luciole 1B SFT 1.2)
+
+- **Tokenizer +1 in tool cases** — llama.cpp's SentencePiece-style
+  tokenizer emits a spurious `▁▁` (double-space) piece when a special
+  token is followed by exactly one literal space then text. Affects the
+  fixed instruction string `function name and arguments within
+  <tool_call></tool_call> XML tags:` in the system prompt of every
+  tool-using conversation. Reported upstream; harmless (decoded string
+  unchanged), but the model sees one out-of-distribution token per
+  request. Flagged `[WARN]`.
+
+- **Chat template `−N` in tool cases** — Ollama renders each tool with
+  Go's `json.Marshal` (compact); jinja uses pretty JSON. ~21 tokens
+  saved per tool definition. Cosmetic; the model parses both
+  identically. Flagged `[WARN]`.
+
+- **Logit drift at layer 0, attention block** — even with f16 GGUF
+  matching f16 transformers, the attention output already diverges
+  significantly at L00 (cos_d ≈ 0.05 on case 02). Most likely
+  PyTorch SDPA vs llama.cpp attention kernel: different reduction
+  orders in fp16 give different accumulation. Drifts to ~0.3 cos_d by
+  the last layer. Top-1 token usually still matches.
+
+- **`convert_hf_to_gguf.py` precision pitfall** — for the Nemotron
+  LayerNorm1p hack, `data_torch + 1` must be done in fp32, otherwise
+  the bf16 source values round before storage. Use
+  `data_torch.float() + 1`. Other entries in the converter with the
+  same `+ 1` pattern (Gemma, Nemotron-H, line ~8887 MTP block, lines
+  5731+ for some mamba variant) should be audited similarly.
+
+---
+
+## File layout
+
+```
+test_conversion/
+├── README.md                  # this file
+├── test_main.py               # main orchestrator (steps 1–5)
+├── test_cases.py              # canonical test conversations
+├── run_transformers.py        # step 1
+├── run_ollama.py              # step 2
+├── run_behavior.py            # step 3
+├── run_logits.py              # step 4
+├── compare.py                 # step 5 — unified report
+├── run_layer_diff.py          # layer-diff tool
+├── compare_layers.py          # layer-diff report + plots
+├── test.sh                    # convenience wrapper (if present)
+└── results/                   # outputs land here, one subfolder per <hf>__vs__<gguf>
+    └── <hf>__vs__<gguf>/
+        ├── transformers.json
+        ├── ollama.json
+        ├── behavior.json
+        ├── logits.json
+        └── layer_diff_<case>/
+            ├── tf_layers.npz
+            ├── gguf_layers.bin
+            ├── meta.json
+            ├── layer_diff_report.txt
+            └── *.png
+```
+
+## Hard-coded paths to update
+
+`run_layer_diff.py` has the llama.cpp build path baked in at the top:
+
+```python
+LLAMA_BIN_DIR = Path("/home/jlouradour/src.nowsl/llama.cpp/build/bin")
+```
+
+Change this if your build directory is elsewhere.
diff --git a/test_conversion/compare.py b/test_conversion/compare.py
new file mode 100644
index 00000000000..1cd182881db
--- /dev/null
+++ b/test_conversion/compare.py
@@ -0,0 +1,348 @@
+"""
+Compare transformers.json and ollama.json (token counts) and optionally
+behavior.json (functional tool-call check), then print a per-test report.
+
+Token-count comparison (two checks per case):
+
+    Tokenizer
+        Pass the transformers-rendered prompt through Ollama with raw=true
+        and compare prompt_eval_count to len(transformers token_ids).
+        Tests just the GGUF tokenizer, isolated from the chat template.
+
+    Chat template
+        Pass the conversation through Ollama's /api/chat.
+        Tests Ollama's template + GGUF tokenizer together.
+
+Known acceptable divergences (reported as [WARN], not [FAIL]):
+
+    [tokenizer +1 in tool cases]
+        SentencePiece single-space-after-special-token quirk in llama.cpp's
+        BPE tokenizer. The model sees one extra space-prefix token where the
+        HF tokenizer didn't. Harmless (same decoded string). See llama.cpp
+        issue tracker for the upstream bug.
+
+    [chat template -N in tool cases]
+        Ollama renders each tool via Go's json.Marshal (compact JSON, no
+        spaces). The jinja template uses tojson (pretty JSON, with spaces).
+        Same data, same field order, just whitespace. The model parses both
+        identically; cosmetic.
+
+Behavioural section (only if --behavior path is provided):
+
+    For each case with an `expected_behavior` field, run_behavior.py asked
+    Ollama to actually generate a turn and checked whether the assistant's
+    response satisfied the expectation (correct tool_call name + args).
+
+Logits section (only if --logits path is provided):
+
+    Per-case next-token top-K log-probability comparison between
+    transformers (reference) and Ollama (GGUF). Catches subtle conversion
+    or quantization regressions that the binary behavioural test misses.
+    Aggregate metrics: top-1 agreement rate, mean |Δlp_top1|, mean|Δlp|_top3.
+
+Exit code is 0 iff no [FAIL] anywhere; [WARN]s are non-fatal.
+
+    Heuristic thresholds for the logits section:
+      top-1 agreement       < 50%   -> [FAIL]   (very likely conversion bug)
+      top-1 agreement       < 80%   -> [WARN]
+      aggregate |Δlp_top1|  > 1.0   -> [FAIL]   (model is very differently confident)
+      aggregate |Δlp_top1|  > 0.3   -> [WARN]
+      aggregate mean|Δlp|_top3  > 3.0  -> [WARN]   (not a FAIL — top-3 is
+                                                    sensitive to tail noise; use
+                                                    only as a soft signal)
+
+    Notes:
+      - Top-1 lp diff is the primary numeric signal. For fp16-vs-fp16 it
+        is typically < 0.1. For Q4_K_M, < 0.5 is normal.
+      - mean|Δlp|_top3 is reported for completeness but is noisy by nature:
+        fp16 softmax precision degrades for low-probability tokens, so a
+        single tail outlier can drag the mean up. Use top-1 metrics for
+        pass/fail; treat mean_top3 as informational.
+      - Top-1 mismatches often differ only in vocab variant (e.g. `▁The`
+        vs `The` for the same word) — those are real model behaviour
+        differences worth noting but typically caused by SP/llama.cpp
+        tokenization quirks, not gross conversion errors.
+
+Usage:
+    python compare.py <transformers_json> <ollama_json>
+                      [--behavior <behavior_json>]
+                      [--logits   <logits_json>]
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load(path):  # index the JSON list of per-case records by each record's "name"
+    return dict((rec["name"], rec) for rec in json.loads(Path(path).read_text()))
+
+
+def fmt(status):  # fixed-width report tag; an unknown status raises KeyError
+    return dict(ok="[ OK ]", warn="[WARN]", fail="[FAIL]")[status]
+
+
+def classify(case_has_tools, tf_count, actual_count, kind):
+    """Grade one report cell: returns (status, label, note); note is None unless "warn"."""
+    if actual_count is None:
+        return "ok", "skipped", None  # a probe that was not run is neutral, not a failure
+    pair = f"{tf_count} vs {actual_count}"
+    delta = actual_count - tf_count
+    if not delta:
+        return "ok", pair, None
+
+    # Known, tolerated divergences only ever apply to cases that carry tools.
+    if case_has_tools and kind == "tokenizer" and delta == 1:
+        note = "SPM single-space-after-special-token quirk (llama.cpp tokenizer bug; harmless)"
+        return "warn", f"{pair}  (+1)", note
+    if case_has_tools and kind == "chat" and delta < 0:
+        note = "Ollama renders tools as compact JSON; jinja uses pretty JSON (whitespace only; cosmetic)"
+        return "warn", f"{pair}  ({delta:+d})", note
+    return "fail", f"{pair}  ({delta:+d})", None
+
+
+def report_counts(tf_map, ol_map):
+    """Print the token-count table and WARN notes; return (has_any_fail, has_any_warn)."""
+    names = sorted(set(tf_map) | set(ol_map))
+    notes_by_name = {}  # name -> list of strings
+    any_fail = False
+    any_warn = False
+
+    print(f"\n{'name':<40}  {'tokenizer':<32}  {'chat template':<32}")
+    print("-" * 110)
+
+    for name in names:
+        tf_r = tf_map.get(name)
+        ol_r = ol_map.get(name)
+
+        if tf_r is None or "error" in tf_r:  # missing or errored on the transformers side
+            err = tf_r.get("error", "missing") if tf_r else "missing"
+            print(f"{name:<40}  transformers ERROR: {err}")
+            any_fail = True
+            continue
+        if ol_r is None:  # case exists for transformers but has no Ollama record
+            print(f"{name:<40}  ollama ERROR: missing")
+            any_fail = True
+            continue
+
+        tf_count = tf_r["token_count"]
+        has_tools = tf_r.get("tools") is not None
+
+        # Tokenizer probe: raw_prompt_eval_count vs the transformers token count
+        raw_err = ol_r.get("raw_error")
+        if raw_err:
+            tok_status, tok_label = "fail", f"err: {raw_err[:24]}"
+            tok_note = None
+        else:
+            tok_status, tok_label, tok_note = classify(
+                has_tools, tf_count, ol_r.get("raw_prompt_eval_count"), "tokenizer")
+
+        # Chat template probe: chat_prompt_eval_count vs the transformers token count
+        chat_err = ol_r.get("chat_error")
+        if chat_err:
+            chat_status, chat_label = "fail", f"err: {chat_err[:24]}"
+            chat_note = None
+        else:
+            chat_status, chat_label, chat_note = classify(
+                has_tools, tf_count, ol_r.get("chat_prompt_eval_count"), "chat")
+
+        tok_cell = f"{fmt(tok_status)} {tok_label}"
+        chat_cell = f"{fmt(chat_status)} {chat_label}"
+        print(f"{name:<40}  {tok_cell:<32}  {chat_cell:<32}")
+
+        notes = []
+        if tok_note: notes.append(f"tokenizer: {tok_note}")
+        if chat_note: notes.append(f"chat: {chat_note}")
+        if notes:
+            notes_by_name[name] = notes
+
+        any_fail = any_fail or (tok_status == "fail" or chat_status == "fail")
+        any_warn = any_warn or (tok_status == "warn" or chat_status == "warn")
+
+    # Print accumulated WARN notes (each unique note once is more readable)
+    if notes_by_name:
+        print()
+        print("WARN notes:")
+        printed = set()
+        for name, notes in notes_by_name.items():
+            for n in notes:
+                if n not in printed:
+                    print(f"  - {n}")
+                    printed.add(n)
+        print("  (cases with these warnings: " +
+              ", ".join(sorted(notes_by_name)) + ")")
+
+    return any_fail, any_warn
+
+
+def report_logits(logits_list):
+    """Print per-case and aggregate logit metrics; return (has_any_fail, has_any_warn)."""
+    print(f"\n{'name':<40}  {'top1':<5}  {'|Δlp_top1|':<11}  {'mean|Δlp|_top3':<15}  "
+          f"{'top5':<5}  {'miss':<5}  {'tf top-1':<22}  {'ol top-1':<22}")
+    print("-" * 140)
+
+    top1_total = 0  # cases that produced a usable comparison
+    top1_matches = 0
+    top1_diffs = []
+    top3_means = []
+    skipped = []  # (name, reason) pairs, listed after the table
+    any_error = False
+
+    for r in logits_list:
+        name = r["name"]
+        if "skipped" in r:
+            skipped.append((name, r["skipped"]))
+            continue
+        if "error" in r:
+            print(f"{name:<40}  ERROR: {r['error']}")
+            any_error = True
+            continue
+        cmp = r.get("comparison")
+        if not cmp:  # a missing or empty comparison is reported and counted as an error
+            print(f"{name:<40}  no comparison (empty logprobs?)")
+            any_error = True
+            continue
+
+        top1_total += 1
+        if cmp["top1_match"]:
+            top1_matches += 1
+        if cmp.get("top1_lp_diff") is not None:
+            top1_diffs.append(cmp["top1_lp_diff"])
+        if cmp.get("mean_lp_diff_top3") is not None:
+            top3_means.append(cmp["mean_lp_diff_top3"])
+
+        f = lambda v: (f"{v:.4f}" if v is not None else "n/a")  # format a possibly-missing float
+        miss_s = f"{cmp['tf_top5_missing_in_ollama_topk']}/5"
+        tf1 = cmp["tf_top1"]; ol1 = cmp["ol_top1"]
+        tf_lbl = f"{tf1['tok']!r}@{tf1['lp']}"
+        ol_lbl = f"{ol1['tok']!r}@{ol1['lp']}"
+        marker = "✓" if cmp["top1_match"] else "✗"
+        print(f"{name:<40}  {marker:<5}  {f(cmp.get('top1_lp_diff')):<11}  "
+              f"{f(cmp.get('mean_lp_diff_top3')):<15}  {cmp['top5_overlap']}/5    "
+              f"{miss_s:<5}  {tf_lbl[:21]:<22}  {ol_lbl[:21]:<22}")
+
+    if skipped:
+        print()
+        print("  Skipped (no add_generation_prompt — no canonical next token):")
+        for n, why in skipped:
+            print(f"    - {n}  ({why})")
+
+    print()
+    if top1_total == 0:  # nothing comparable: only earlier errors can fail this section
+        print("  (no logit comparisons completed)")
+        return any_error, False
+
+    top1_rate = top1_matches / top1_total
+    agg_top1 = (sum(top1_diffs) / len(top1_diffs)) if top1_diffs else None
+    agg_top3 = (sum(top3_means) / len(top3_means)) if top3_means else None
+
+    print(f"  aggregate over {top1_total} comparable case(s):")
+    print(f"    top-1 agreement      = {top1_matches}/{top1_total} = {top1_rate*100:.1f}%")
+    if agg_top1 is not None:
+        print(f"    aggregate |Δlp_top1|      = {agg_top1:.4f}")
+    if agg_top3 is not None:
+        print(f"    aggregate mean|Δlp|_top3  = {agg_top3:.4f}")
+
+    fail = False
+    warn = False
+    if top1_rate < 0.5:
+        print(f"  [FAIL] top-1 agreement {top1_rate*100:.1f}% < 50% — likely a conversion bug")
+        fail = True
+    elif top1_rate < 0.8:
+        print(f"  [WARN] top-1 agreement {top1_rate*100:.1f}% < 80% — investigate the mismatching cases")
+        warn = True
+    if agg_top1 is not None:
+        if agg_top1 > 1.0:
+            print(f"  [FAIL] |Δlp_top1| {agg_top1:.4f} > 1.0 — confidence on top token diverges sharply")
+            fail = True
+        elif agg_top1 > 0.3:
+            print(f"  [WARN] |Δlp_top1| {agg_top1:.4f} > 0.3 — model is less confident on chosen tokens; investigate")
+            warn = True
+    if agg_top3 is not None and agg_top3 > 3.0:
+        # WARN-only: top-3 mean is noisy by nature (fp16 softmax tail).
+        print(f"  [WARN] mean|Δlp|_top3 {agg_top3:.4f} > 3.0 — distribution shifted in top-3 (soft signal)")
+        warn = True
+    return (fail or any_error), warn
+
+
+def report_behavior(behavior_map):
+    """Print one pass/fail row per behavioural case; return True if any case failed."""
+    print(f"\n{'name':<40}  {'behaviour':<60}")
+    print("-" * 110)
+
+    failures = 0
+    for name, rec in sorted(behavior_map.items(), key=lambda kv: kv[0]):
+        passed = rec.get("pass") is True  # anything but a literal True counts as a failure
+        why = rec.get("fail_reason") or ""
+        print(f"{name:<40}  {fmt('ok' if passed else 'fail')} {why[:55]}")
+        if passed:
+            continue
+        failures += 1
+        # Show what the model actually produced so the failure can be debugged.
+        output = rec.get("raw_output")
+        if output is not None:
+            if len(output) > 200:
+                output = output[:200] + "...(truncated)"
+            print(f"  raw model output: {output!r}")
+        call = rec.get("parsed_tool_call")
+        if call:
+            print(f"  parsed tool_call: name={call.get('name')!r} args={call.get('arguments')!r}")
+    return failures > 0
+
+
+def main():  # CLI entry point: print every report section, then exit 1 on any FAIL, else 0
+    parser = argparse.ArgumentParser()
+    parser.add_argument("transformers_json")
+    parser.add_argument("ollama_json")
+    parser.add_argument("--behavior", default=None,
+                        help="Optional behaviour JSON from run_behavior.py")
+    parser.add_argument("--logits", default=None,
+                        help="Optional logits JSON from run_logits.py")
+    args = parser.parse_args()
+
+    tf_map = load(args.transformers_json)
+    ol_map = load(args.ollama_json)
+
+    print("=" * 120)
+    print("TOKEN COUNT COMPARISON")
+    print("=" * 120)
+    count_fail, count_warn = report_counts(tf_map, ol_map)
+
+    behavior_fail = False
+    if args.behavior and Path(args.behavior).exists():  # section is skipped if the file is absent
+        beh_map = load(args.behavior)
+        if beh_map:
+            print()
+            print("=" * 120)
+            print("BEHAVIOURAL CHECK  (model actually called the right tool)")
+            print("=" * 120)
+            behavior_fail = report_behavior(beh_map)
+
+    logits_fail = False
+    logits_warn = False
+    if args.logits and Path(args.logits).exists():  # section is skipped if the file is absent
+        logits_list = json.loads(Path(args.logits).read_text())
+        if logits_list:
+            print()
+            print("=" * 120)
+            print("LOGIT COMPARISON  (next-token top-K distribution; transformers vs Ollama)")
+            print("=" * 120)
+            logits_fail, logits_warn = report_logits(logits_list)
+
+    print()
+    any_fail = count_fail or behavior_fail or logits_fail
+    any_warn = count_warn or logits_warn  # the behavioural section has no WARN state
+    if any_fail:
+        print(">>> RESULT: FAIL")
+        sys.exit(1)
+    elif any_warn:
+        print(">>> RESULT: PASS (with known acceptable warnings)")
+        sys.exit(0)
+    else:
+        print(">>> RESULT: PASS")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/compare_layers.py b/test_conversion/compare_layers.py
new file mode 100644
index 00000000000..2522343068c
--- /dev/null
+++ b/test_conversion/compare_layers.py
@@ -0,0 +1,283 @@
+"""
+Compare per-layer activations from tf_layers.npz and gguf_layers.bin
+and produce a divergence plot.
+
+For each pair (transformers tensor, GGUF tensor) referring to the same
+position in the graph, we compute:
+
+  max_abs_diff   max |x_tf - x_gg|  in fp32
+  mean_abs_diff  mean over all elements
+  rel_max        max_abs_diff / max(|x_tf|, |x_gg|, 1e-8)
+  cosine         1 - cos(x_tf, x_gg)  (smaller = closer)
+  l2_rel         ||x_tf - x_gg|| / ||x_tf||
+
+Mappings used (nemotron architecture):
+  GGUF                 transformers
+  l_out-i              hidden-(i+1)               per-layer output (post-residual)
+  attn_norm-i          attn_norm-i                input_layernorm output
+  ffn_inp-i            (attn-residual sum)        not directly hookable in transformers;
+                                                  approximated as hidden-i + self_attn-i
+  ffn_norm-i           post_norm-i                post_attention_layernorm output
+  ffn_out-i            (mlp + residual)           approximated as ffn_inp + mlp output
+  result_norm          final_norm                 output of model.model.norm
+  result_output        logits                     final LM head
+
+The l_out / final_norm / logits rows of the report are the most reliable
+(they need no hook approximation). The remaining rows give the fine-grained
+per-op analysis.
+
+Output:
+  <work_dir>/layer_diff_report.txt      text report
+  <work_dir>/layer_diff_overview.png    log-scale per-layer divergence plot
+  <work_dir>/layer_diff_l_out.png       l_out divergence over depth (max_abs, l2_rel, cosine)
+
+Usage:
+    python compare_layers.py <work_dir>
+"""
+
+import argparse
+import json
+import struct
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+import numpy as np
+
+
+# ggml_type enum ids → (label, numpy dtype) for the types expected on activation tensors.
+GGML_TYPE = {
+    0:  ("f32",  np.float32),
+    1:  ("f16",  np.float16),
+    26: ("i32",  np.int32),
+    27: ("i64",  np.int64),
+    30: ("bf16", None),    # no numpy dtype; decoded by hand in parse_gguf_dump
+}
+
+
+def parse_gguf_dump(path: Path):
+    """Yield (name, fp32 ndarray) for each record of the llama.cpp tensor dump.
+
+    Record layout (little-endian): u32 name_len, name, u32 dtype, i64 ne[4],
+    u64 n_bytes, payload. Parsing stops at the first implausible or truncated
+    record; records whose dtype is not in GGML_TYPE are skipped.
+    """
+    data = Path(path).read_bytes()
+    i = 0
+    while i + 4 <= len(data):
+        name_len = struct.unpack_from("<I", data, i)[0]; i += 4
+        if name_len == 0 or name_len > 1024:
+            break
+        if i + name_len + 44 > len(data):
+            break  # truncated header (name + u32 dtype + 4*i64 ne + u64 n_bytes)
+        name = data[i:i+name_len].decode("utf-8", errors="replace"); i += name_len
+        dtype = struct.unpack_from("<I", data, i)[0]; i += 4
+        ne = struct.unpack_from("<4q", data, i); i += 32
+        nbytes = struct.unpack_from("<Q", data, i)[0]; i += 8
+        if i + nbytes > len(data):
+            break  # truncated payload: do not yield a silently shortened tensor
+        raw = data[i:i+nbytes]; i += nbytes
+        info = GGML_TYPE.get(dtype)
+        if info is None:
+            continue  # dtype with no entry in GGML_TYPE: skip the record
+        _, np_dtype = info
+        if np_dtype is not None:
+            arr = np.frombuffer(raw, dtype=np_dtype).astype(np.float32)
+        else:
+            # bf16 is the upper 16 bits of an fp32: widen, then shift into place
+            u16 = np.frombuffer(raw, dtype=np.uint16).astype(np.uint32)
+            arr = (u16 << 16).view(np.float32).copy()
+        # ne[] lists the contiguous (fastest-varying) dim first, the reverse of
+        # numpy's C order. Drop every dim of size <= 1, then reverse.
+        shape = tuple(int(s) for s in ne if s > 1) or (1,)
+        try:
+            arr = arr.reshape(shape[::-1])
+        except ValueError:
+            pass  # element count does not match ne: leave the array flat
+        yield name, arr
+
+
+def compute_diff_metrics(x_tf, x_gg):
+    """Elementwise divergence metrics in fp32; None if the element counts differ."""
+    if x_tf.size != x_gg.size:
+        return None
+    ref = x_tf.astype(np.float32).ravel()
+    got = x_gg.astype(np.float32).ravel()
+    delta = ref - got
+    mag = np.abs(delta)
+    worst = float(mag.max())
+    # relative to the largest magnitude on either side, floored at 1e-8
+    scale = float(max(np.abs(ref).max(), np.abs(got).max(), 1e-8))
+    ref_norm = float(np.linalg.norm(ref))
+    got_norm = float(np.linalg.norm(got))
+    return {
+        "max_abs": worst, "mean_abs": float(mag.mean()), "rel_max": worst / scale,
+        "cosine": 1.0 - float(ref @ got) / (ref_norm * got_norm + 1e-12),
+        "l2_rel": float(np.linalg.norm(delta)) / (ref_norm + 1e-12),
+    }
+
+
+def main():  # CLI entry point: compare tf_layers.npz with gguf_layers.bin, write report + plots
+    parser = argparse.ArgumentParser()
+    parser.add_argument("work_dir")
+    parser.add_argument("--top-tail-only", action="store_true",
+                        help="For multi-token tensors, only compare the LAST token position "
+                             "(matches what next-token prediction uses).")
+    args = parser.parse_args()
+
+    work_dir = Path(args.work_dir).resolve()
+    tf_path = work_dir / "tf_layers.npz"
+    gg_path = work_dir / "gguf_layers.bin"
+    meta = json.loads((work_dir / "meta.json").read_text())
+
+    print(f"Comparing layer activations for case {meta['case']!r}")
+    print(f"  HF dir: {meta['hf_model_dir']}")
+    print(f"  GGUF:   {meta['gguf_path']}")
+    print()
+
+    tf = dict(np.load(tf_path))
+    gg = dict(parse_gguf_dump(gg_path))
+    print(f"transformers: {len(tf)} arrays")
+    print(f"gguf:         {len(gg)} arrays")
+
+    n_layers = max(int(k.split("-")[1]) for k in tf if k.startswith("hidden-"))
+    # hidden_states has N+1 entries (hidden-0..hidden-N), so the largest index is N = num_layers
+    print(f"layers: {n_layers}")
+
+    T = int(tf["tokens"].shape[-1])
+    print(f"tokens: {T}")
+
+    # Mapping rules: each entry is (label, op_type, tf_array_or_callable, gg_key)
+    # tf entry can be a string (npz key) or a callable taking the tf dict and
+    # returning an array — to combine multiple hook outputs.
+    def add_resid(i):
+        """ffn_inp = hidden[i] + self_attn[i]  (post-attention, pre-MLP, with residual)"""
+        return lambda tfd: tfd[f"hidden-{i}"] + tfd[f"self_attn-{i}"]
+
+    mappings = []
+    for i in range(n_layers):
+        mappings.append((f"L{i:02d} attn_norm", "norm",      f"attn_norm-{i}",  f"attn_norm-{i}"))
+        mappings.append((f"L{i:02d} ffn_inp",   "post_attn", add_resid(i),      f"ffn_inp-{i}"))
+        mappings.append((f"L{i:02d} ffn_norm",  "norm",      f"post_norm-{i}",  f"ffn_norm-{i}"))
+        mappings.append((f"L{i:02d} l_out",     "block_out", f"hidden-{i+1}",   f"l_out-{i}"))
+    mappings.append(("final_norm",  "norm",       "final_norm",  "result_norm"))
+    mappings.append(("logits",      "head",       "logits",      "result_output"))
+
+    rows = []
+    for label, op, tk, gk in mappings:
+        # Resolve transformers tensor (string key or callable on the dict)
+        if callable(tk):
+            try:
+                x_tf = tk(tf)
+            except KeyError as e:
+                rows.append({"label": label, "op": op, "skip": f"missing tf key {e}"})
+                continue
+        else:
+            if tk not in tf:
+                rows.append({"label": label, "op": op, "skip": f"missing tf={tk}"})
+                continue
+            x_tf = tf[tk]
+        if gk not in gg:
+            rows.append({"label": label, "op": op, "skip": f"missing gg={gk}"})
+            continue
+        x_gg = gg[gk]
+        if args.top_tail_only:
+            # Pick last token slice
+            x_tf = x_tf.reshape(-1, x_tf.shape[-1])[-1] if x_tf.ndim >= 2 else x_tf
+            x_gg = x_gg.reshape(-1, x_gg.shape[-1])[-1] if x_gg.ndim >= 2 else x_gg
+        m = compute_diff_metrics(x_tf, x_gg)
+        if m is None:
+            rows.append({"label": label, "op": op, "skip": f"shape mismatch tf={x_tf.shape} gg={x_gg.shape}"})
+            continue
+        rows.append({"label": label, "op": op, **m, "shape": tuple(x_tf.shape)})
+
+    # ─── Text report ───
+    out_txt = work_dir / "layer_diff_report.txt"
+    lines = []
+    lines.append(f"Layer-by-layer activation comparison — case {meta['case']!r}")
+    lines.append(f"HF dtype: {meta['dtype']} on {meta['device']}")
+    lines.append("")
+    lines.append(f"{'label':<25}  {'op':<10}  {'shape':<22}  {'max|Δ|':<10}  {'mean|Δ|':<10}  {'rel_max':<9}  {'cos_d':<10}  {'l2_rel':<10}")
+    lines.append("-" * 120)
+    for r in rows:
+        if "skip" in r:
+            lines.append(f"{r['label']:<25}  {r['op']:<10}  SKIP: {r['skip']}")
+            continue
+        shape = str(r["shape"])
+        lines.append(f"{r['label']:<25}  {r['op']:<10}  {shape:<22}  "
+                     f"{r['max_abs']:<10.4g}  {r['mean_abs']:<10.4g}  "
+                     f"{r['rel_max']:<9.3g}  {r['cosine']:<10.4g}  {r['l2_rel']:<10.4g}")
+    report = "\n".join(lines)
+    out_txt.write_text(report + "\n")
+    print()
+    print(report)
+    print()
+    print(f"Wrote {out_txt}")
+
+    # ─── Plots ───
+    try:
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+    except ImportError:
+        print("matplotlib not installed; skipping plots (pip install matplotlib)")
+        return
+
+    valid = [r for r in rows if "skip" not in r]
+    layer_rows = [r for r in valid if r["label"].startswith("L")]
+
+    # Plot 1: overview, divergence vs layer index, separate lines per op
+    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+    by_op = defaultdict(list)
+    for r in layer_rows:
+        # label is "L<idx> <op>" (e.g. "L00 attn_norm"); idx has 3+ digits from layer 100 on
+        layer_tag, op_label = r["label"].split()[:2]
+        layer_idx = int(layer_tag[1:])
+        by_op[op_label].append((layer_idx, r))
+
+    for ax, metric in zip(axes, ["max_abs", "l2_rel"]):
+        for op_label, lst in by_op.items():
+            lst.sort(key=lambda pair: pair[0])  # order by layer index only; never compare row dicts
+            xs = [li for li, _ in lst]
+            ys = [r[metric] for _, r in lst]
+            ax.plot(xs, ys, marker="o", label=op_label)
+        # add final_norm and logits as scatter at x = N
+        for r in valid:
+            if r["label"] == "final_norm":
+                ax.scatter([n_layers], [r[metric]], marker="*", s=200, label="final_norm")
+            if r["label"] == "logits":
+                ax.scatter([n_layers + 0.3], [r[metric]], marker="X", s=150, label="logits")
+        ax.set_xlabel("layer index")
+        ax.set_ylabel(metric)
+        ax.set_yscale("log")
+        ax.set_title(f"divergence per layer  ({metric})")
+        ax.grid(True, which="both", alpha=0.3)
+        ax.legend(fontsize=8)
+    fig.suptitle(f"transformers vs GGUF — case {meta['case']!r}")
+    fig.tight_layout()
+    p1 = work_dir / "layer_diff_overview.png"
+    fig.savefig(p1, dpi=110, bbox_inches="tight")
+    print(f"Wrote {p1}")
+
+    # Plot 2: cumulative growth of l_out divergence to highlight where drift accumulates
+    fig, ax = plt.subplots(figsize=(10, 5))
+    l_out_rows = sorted(by_op.get("l_out", []), key=lambda pair: pair[0])
+    if l_out_rows:
+        xs = [li for li, _ in l_out_rows]
+        for metric in ["max_abs", "l2_rel", "cosine"]:
+            ys = [r[metric] for _, r in l_out_rows]
+            ax.plot(xs, ys, marker="o", label=metric)
+        ax.set_xlabel("layer index")
+        ax.set_ylabel("metric")
+        ax.set_yscale("log")
+        ax.set_title(f"l_out divergence over depth — case {meta['case']!r}")
+        ax.grid(True, which="both", alpha=0.3)
+        ax.legend()
+    fig.tight_layout()
+    p2 = work_dir / "layer_diff_l_out.png"
+    fig.savefig(p2, dpi=110, bbox_inches="tight")
+    print(f"Wrote {p2}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/run_behavior.py b/test_conversion/run_behavior.py
new file mode 100644
index 00000000000..8c1762d2194
--- /dev/null
+++ b/test_conversion/run_behavior.py
@@ -0,0 +1,224 @@
+"""
+Behavioural test: for each test case that has an `expected_behavior` field,
+actually run the model and verify the assistant response satisfies the
+expectation (e.g. emits a tool_call with the right name + args).
+
+IMPORTANT design note — why /api/generate raw=true:
+
+    Ollama's /api/chat tool-call parser (at least in 0.24) silently drops
+    model output when it detects a tool_call tag but fails to extract a
+    valid call mid-stream. The model can emit a perfectly well-formed
+    <tool_call>...</tool_call> block and the chat API still returns
+    {"content": "", "tool_calls": null}. That hides whether the *model*
+    works.
+
+    To test the model behind Ollama, we bypass the chat layer entirely:
+      1. Take the transformers-rendered prompt (already computed in
+         transformers.json — same exact string the model would see at
+         training time).
+      2. Feed it via /api/generate with raw=true (no template, no parsing).
+      3. Read back the raw text the model emitted, parse <tool_call> blocks
+         ourselves with a regex, and check the expectation.
+
+    This tests the GGUF model + tokenizer end-to-end without Ollama's chat
+    quirks getting in the way.
+
+Currently supports one kind of expectation:
+
+    expected_behavior = {
+        "tool_call": {
+            "name":              "get_weather",
+            "required_args":     ["location"],
+            "args_must_contain": {"location": "paris"},  # case-insensitive substring
+        }
+    }
+
+Output JSON: one entry per case with expected_behavior, including
+{name, expected, raw_output, parsed_tool_call, pass, fail_reason}.
+
+Usage:
+    python run_behavior.py <modelfile_path> <output_json>
+                           --transformers-output <transformers.json>
+                           [--model-name NAME]
+                           [--ollama-url URL]
+                           [--num-predict N]
+"""
+
+import argparse
+import json
+import re
+import sys
+import traceback
+from pathlib import Path
+
+try:
+    import requests
+except ImportError:
+    sys.exit("ERROR: this script needs the 'requests' package (pip install requests).")
+
+sys.path.insert(0, str(Path(__file__).parent))
+from test_cases import TEST_CASES  # noqa: E402
+from run_ollama import check_ollama_alive, ollama_create, ollama_delete, _post  # noqa: E402
+
+
+# Captures the {...} body of "<tool_call> ... </tool_call>"; DOTALL lets the body span lines.
+TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)
+
+
+def ollama_generate_raw(url, model, prompt, num_predict):
+    """POST `prompt` to {url}/api/generate with raw=True, stream=False; return _post's result."""
+    # temperature 0 and seed 0 are fixed. Generation stops at the chat-message
+    # terminator so no tokens are spent running on into the next turn.
+    sampling = dict(num_predict=num_predict, temperature=0, seed=0,
+                    stop=["<|im_end|>"])
+    request_body = dict(
+        model=model,
+        prompt=prompt,
+        raw=True,
+        stream=False,
+        options=sampling,
+    )
+
+    endpoint = f"{url}/api/generate"
+    return _post(endpoint, request_body)
+
+
+def parse_tool_call_from_text(text):
+    """Extract and decode the first <tool_call>...</tool_call> block in `text`.
+
+    Returns (call, None) on success, with call = {"name": ..., "arguments": ...},
+    or (None, error_message) when no block is found or its body cannot be used.
+    """
+    found = TOOL_CALL_RE.search(text)
+    if found is None:
+        return None, "no <tool_call>...</tool_call> block in output"
+    payload = found.group(1)
+    try:
+        decoded = json.loads(payload)
+    except json.JSONDecodeError as exc:
+        return None, f"<tool_call> body is not valid JSON: {exc}; body={payload!r}"
+    if "name" not in decoded:
+        return None, f"<tool_call> body has no 'name' field; body={decoded!r}"
+    arguments = decoded.get("arguments", {})
+    if isinstance(arguments, str):
+        # Arguments may arrive JSON-encoded as a string; keep the string if it won't decode.
+        try:
+            arguments = json.loads(arguments)
+        except json.JSONDecodeError:
+            pass
+    return {"name": decoded["name"], "arguments": arguments}, None
+
+
+def evaluate_tool_call(expected, parsed):
+    """Compare a parsed tool call against the expectation, stopping at the first mismatch.
+
+    Returns (True, None) on success, otherwise (False, reason).
+    """
+    got_name, want_name = parsed["name"], expected["name"]
+    if got_name != want_name:
+        return False, f"wrong tool name: got {got_name!r}, expected {want_name!r}"
+    call_args = parsed["arguments"]
+    if not isinstance(call_args, dict):
+        return False, f"arguments not a dict: {call_args!r}"
+
+    absent = [k for k in expected.get("required_args", []) if k not in call_args]
+    if absent:
+        return False, f"missing required arg {absent[0]!r} (got args: {list(call_args)})"
+
+    for arg_name, needle in expected.get("args_must_contain", {}).items():
+        actual = call_args.get(arg_name)
+        if actual is None:
+            return False, f"missing arg {arg_name!r}"
+        if needle.lower() not in str(actual).lower():
+            return False, f"arg {arg_name!r}={actual!r} does not contain {needle!r}"
+
+    return True, None
+
+
+def main():  # CLI entry point: generate once per behavioural case and write the JSON results
+    parser = argparse.ArgumentParser()
+    parser.add_argument("modelfile_path")
+    parser.add_argument("output_json")
+    parser.add_argument("--transformers-output", required=True,
+                        help="Path to transformers.json (provides the exact "
+                             "rendered prompt for each case)")
+    parser.add_argument("--model-name", default="test-chat-template-tmp",
+                        help="Temporary ollama model name (created/deleted by the script).")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--num-predict", type=int, default=256,
+                        help="Max tokens the model may generate per case.")
+    args = parser.parse_args()
+
+    version = check_ollama_alive(args.ollama_url)
+    print(f"[behavior] Ollama reachable (version {version})")
+
+    tf_path = Path(args.transformers_output)
+    if not tf_path.exists():
+        sys.exit(f"ERROR: transformers output not found at {tf_path}. "
+                 "Run run_transformers.py first.")
+    transformers_by_name = {r["name"]: r for r in json.loads(tf_path.read_text())}
+    out = Path(args.output_json)
+    out.parent.mkdir(parents=True, exist_ok=True)  # needed by both write paths below
+
+    cases_with_behavior = [c for c in TEST_CASES if c.get("expected_behavior")]
+    if not cases_with_behavior:
+        print("[behavior] No test cases have an 'expected_behavior' field; nothing to do.")
+        out.write_text("[]")
+        return
+
+    print(f"[behavior] {len(cases_with_behavior)} behavioural case(s) to run")
+
+    ollama_create(args.model_name, args.modelfile_path)
+    try:
+        results = []
+        for case in cases_with_behavior:
+            print(f"[behavior]   {case['name']}")
+            entry = {
+                "name": case["name"],
+                "expected_behavior": case["expected_behavior"],
+            }
+            try:
+                tf = transformers_by_name.get(case["name"])
+                if tf is None or "rendered_prompt" not in tf:
+                    raise RuntimeError(
+                        f"no rendered_prompt for {case['name']} in transformers output"
+                    )
+                resp = ollama_generate_raw(
+                    args.ollama_url, args.model_name,
+                    tf["rendered_prompt"], num_predict=args.num_predict,
+                )
+                raw_output = resp.get("response", "")
+                entry["raw_output"] = raw_output
+                entry["eval_count"] = resp.get("eval_count")
+
+                eb = case["expected_behavior"]
+                if "tool_call" in eb:
+                    parsed, parse_err = parse_tool_call_from_text(raw_output)
+                    entry["parsed_tool_call"] = parsed
+                    if parsed is None:
+                        entry["pass"] = False
+                        entry["fail_reason"] = parse_err
+                    else:
+                        ok, reason = evaluate_tool_call(eb["tool_call"], parsed)
+                        entry["pass"] = ok
+                        entry["fail_reason"] = reason
+                else:
+                    entry["pass"] = False
+                    entry["fail_reason"] = f"unknown expected_behavior keys: {list(eb)}"
+            except Exception as e:
+                # One broken case must not abort the run: record it as a failure.
+                traceback.print_exc()
+                entry["pass"] = False
+                entry["fail_reason"] = f"{type(e).__name__}: {e}"
+
+            marker = "OK  " if entry.get("pass") else "FAIL"
+            print(f"               -> {marker}  {entry.get('fail_reason') or ''}")
+            results.append(entry)
+    finally:
+        ollama_delete(args.model_name)
+
+    out.write_text(json.dumps(results, indent=2, ensure_ascii=False))
+    print(f"[behavior] Wrote {len(results)} results to {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/run_layer_diff.py b/test_conversion/run_layer_diff.py
new file mode 100644
index 00000000000..167eca64c19
--- /dev/null
+++ b/test_conversion/run_layer_diff.py
@@ -0,0 +1,180 @@
+"""
+Layer-by-layer activation dump for both backends, on a single anchor prompt.
+
+Outputs two files in <work_dir>:
+
+  tf_layers.npz      — numpy archive with one array per intermediate tensor:
+                       "tokens"                        : input token ids (1, T)
+                       "hidden-i"  for i in 0..N        : transformers .hidden_states[i]; hidden-0 is the
+                                                          embedding output, hidden-i the output of block i
+                       "attn_norm-i"  for i in 0..N-1   : output of input_layernorm
+                       "self_attn-i"  for i in 0..N-1   : output of the attention block (without residual)
+                       "post_norm-i"  for i in 0..N-1   : output of post_attention_layernorm
+                       "mlp-i"        for i in 0..N-1   : output of MLP (without residual)
+                       "final_norm"                    : after model.model.norm
+                       "logits"                        : final LM head output
+
+  gguf_layers.bin    — binary dump from llama-eval-callback (env-gated).
+                       Records of: u32 name_len, name, u32 dtype, i64 ne[4], u64 nbytes, data.
+
+The companion compare_layers.py loads both and computes per-layer divergence.
+
+Usage:
+    python run_layer_diff.py <hf_model_dir> <gguf_path>
+                             --transformers-output <transformers.json>
+                             --case <case_name>
+                             --work-dir <dir>
+                             [--device cuda|cpu]
+                             [--dtype fp16|fp32|bf16]
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# llama.cpp build output directory; set the LLAMA_BIN_DIR env var to override the default.
+LLAMA_BIN_DIR = Path(os.environ.get("LLAMA_BIN_DIR", "/home/jlouradour/src.nowsl/llama.cpp/build/bin"))
+EVAL_CALLBACK_BIN = LLAMA_BIN_DIR / "llama-eval-callback"
+# Tensor name regex passed to the patched llama.cpp dumper (via LLAMA_DUMP_TENSORS_REGEX).
+# Keep aligned with the names cb()'d by src/models/nemotron.cpp.
+GGUF_DUMP_REGEX = r'^(attn_norm|ffn_inp|ffn_norm|ffn_out|l_out|result_norm|result_output)-?[0-9]*$'
+
+
+def transformers_dump(model_dir: Path, prompt: str, device: str, dtype: torch.dtype, out_path: Path):  # one forward pass on `prompt`; captured activations are saved to out_path via np.savez
+    print(f"[tf] Loading model from {model_dir} ({device}, {dtype})")
+    tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        str(model_dir), torch_dtype=dtype, trust_remote_code=True,
+    ).to(device)
+    model.eval()
+
+    # Match the way our test renders prompts: tokenize the raw rendered prompt
+    # without adding extra special tokens — the prompt already contains them.
+    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
+    input_ids = inputs["input_ids"]
+    print(f"[tf] Input tokens: {input_ids.shape}, last-pos id = {int(input_ids[0,-1])}")
+
+    captures = {}  # archive key -> numpy array, filled by the hooks and the loop below
+
+    def hook_for(key):  # build a forward hook that stores the module's output under `key`
+        def fn(_mod, _inp, out):
+            t = out[0] if isinstance(out, tuple) else out  # tuple outputs: keep only the first element
+            captures[key] = t.detach().cpu().float().numpy()
+        return fn
+
+    handles = []  # hook handles, removed again once the forward pass is done
+    for i, layer in enumerate(model.model.layers):
+        handles.append(layer.input_layernorm.register_forward_hook(hook_for(f"attn_norm-{i}")))
+        handles.append(layer.self_attn.register_forward_hook(hook_for(f"self_attn-{i}")))
+        handles.append(layer.post_attention_layernorm.register_forward_hook(hook_for(f"post_norm-{i}")))
+        handles.append(layer.mlp.register_forward_hook(hook_for(f"mlp-{i}")))
+    handles.append(model.model.norm.register_forward_hook(hook_for("final_norm")))
+
+    with torch.no_grad():
+        out = model(**inputs, output_hidden_states=True)
+
+    for h in handles:
+        h.remove()
+
+    # Per-layer hidden states (hidden[i] is the output of layer i; hidden[0] = embeddings)
+    for i, h in enumerate(out.hidden_states):
+        captures[f"hidden-{i}"] = h.detach().cpu().float().numpy()
+    captures["logits"] = out.logits.detach().cpu().float().numpy()
+    captures["tokens"] = input_ids.cpu().numpy()
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    np.savez(str(out_path), **captures)
+    print(f"[tf] Saved {len(captures)} arrays to {out_path}")
+
+    # Drop the local model reference and, on CUDA, release cached allocator memory.
+    del model
+    if device == "cuda":
+        torch.cuda.empty_cache()
+
+
+def gguf_dump(gguf_path: Path, prompt: str, out_path: Path):  # run llama-eval-callback on `prompt` with the env-gated tensor dump pointed at out_path
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if out_path.exists():
+        out_path.unlink()  # drop any stale dump so the size check below only sees this run's output
+    env = os.environ.copy()
+    env["LD_LIBRARY_PATH"] = ":".join(p for p in (str(LLAMA_BIN_DIR), env.get("LD_LIBRARY_PATH")) if p)  # skip empty entries: a trailing ':' would add the cwd to the search path
+    env["LLAMA_DUMP_TENSORS_FILE"] = str(out_path)
+    env["LLAMA_DUMP_TENSORS_REGEX"] = GGUF_DUMP_REGEX
+    # Atomic tokenization of <|im_start|> etc., so the token count matches
+    # what HF transformers produces on a fully-rendered chat-template prompt.
+    env["LLAMA_TOKENIZE_PARSE_SPECIAL"] = "1"
+
+    cmd = [str(EVAL_CALLBACK_BIN),
+           "-m", str(gguf_path),
+           "-p", prompt,
+           "-n", "1"]
+    print(f"[gguf] Running {EVAL_CALLBACK_BIN.name} with regex={GGUF_DUMP_REGEX!r}")
+    res = subprocess.run(cmd, env=env, capture_output=True, text=True)
+    if res.returncode != 0:
+        print(res.stdout[-1000:])  # only the tail of each stream, to keep the failure report short
+        print(res.stderr[-1000:])
+        sys.exit(f"[gguf] llama-eval-callback failed (exit {res.returncode})")
+    if not out_path.exists() or out_path.stat().st_size == 0:
+        sys.exit(f"[gguf] dump file empty: {out_path}")
+    print(f"[gguf] Dump size: {out_path.stat().st_size/1024:.1f} KB")
+
+
+def main():  # CLI entry point: dump activations from both backends for one case and write meta.json
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("hf_model_dir")
+    parser.add_argument("gguf_path")
+    parser.add_argument("--transformers-output", required=True,
+                        help="Path to existing transformers.json (provides rendered_prompt per case)")
+    parser.add_argument("--case", required=True,
+                        help="Test case name from test_cases.py to use as the anchor prompt")
+    parser.add_argument("--work-dir", required=True)
+    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", choices=["cuda", "cpu"])
+    parser.add_argument("--dtype", default=None, choices=["fp16", "fp32", "bf16"],
+                        help="Defaults to fp16 on cuda, fp32 on cpu")
+    args = parser.parse_args()
+
+    hf_dir = Path(args.hf_model_dir).resolve()
+    gguf_path = Path(args.gguf_path).resolve()
+    work_dir = Path(args.work_dir).resolve()
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    tf_json = json.loads(Path(args.transformers_output).read_text())
+    case = next((c for c in tf_json if c["name"] == args.case), None)
+    if case is None or "rendered_prompt" not in case:  # entries of cases that failed to render carry no prompt
+        sys.exit(f"case {args.case!r} not found (or has no rendered_prompt) in {args.transformers_output}")
+    prompt = case["rendered_prompt"]
+
+    if args.dtype is None:
+        args.dtype = "fp16" if args.device == "cuda" else "fp32"
+    torch_dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[args.dtype]
+
+    tf_out = work_dir / "tf_layers.npz"
+    gg_out = work_dir / "gguf_layers.bin"
+
+    transformers_dump(hf_dir, prompt, args.device, torch_dtype, tf_out)
+    gguf_dump(gguf_path, prompt, gg_out)
+
+    # Also save the prompt and run settings so compare_layers can sanity check.
+    meta = {
+        "case": args.case,
+        "prompt": prompt,
+        "device": args.device,
+        "dtype": args.dtype,
+        "hf_model_dir": str(hf_dir),
+        "gguf_path": str(gguf_path),
+    }
+    (work_dir / "meta.json").write_text(json.dumps(meta, indent=2, ensure_ascii=False))
+    print(f"Saved meta to {work_dir / 'meta.json'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/run_logits.py b/test_conversion/run_logits.py
new file mode 100644
index 00000000000..c1dd875f3e2
--- /dev/null
+++ b/test_conversion/run_logits.py
@@ -0,0 +1,308 @@
+"""
+Logit-level comparison: for each test case, compute the next-token
+top-K log-probability distribution from
+  (a) the transformers model (forward pass on the rendered prompt)
+  (b) Ollama serving the GGUF (logprobs API on the same prompt)
+and report top-1 agreement, top-5 overlap, and KL divergence.
+
+This catches subtle numerical regressions in the GGUF (quantization,
+conversion bugs, wrong activation, etc.) that the binary tool-call
+behavioural test would not notice.
+
+Per-case metrics:
+
+  top1_match           bool  — same most-likely next token (most important)
+  top1_lp_diff         float — |TF top-1 logprob − Ollama top-1 logprob|.
+                               Concrete confidence delta on the chosen token.
+                               fp16-vs-fp16: typically < 0.1.
+                               Q4_K_M: typically < 0.5.
+  top5_overlap         int   — how many of TF's top-5 are in Ollama's top-5 (0..5).
+  mean_lp_diff_top3    float — primary aggregate metric: mean |Δlp| over TF's
+                               top-3 tokens (aligned by token ID). Top-3 covers
+                               the bulk of the probability mass; excluding the
+                               4-5 tail tokens avoids the high noise that fp16
+                               softmax has on low-probability logits.
+  mean_lp_diff_top5    float — same but over top-5; reported for completeness.
+                               Naturally noisier; use top-3 for judgement.
+  tf_top5_missing_in_ollama_topk
+                       int   — count of TF's top-5 tokens not in Ollama's top-K. High counts
+                               mean Ollama wasn't even close on those tokens (significant divergence).
+  kl_div_renorm        float — secondary: KL on renormalized common-top-K.
+                               Can be inflated; ignore unless other signals
+                               also flag.
+
+Cases whose rendered prompt does NOT end at a generation point (i.e.
+add_generation_prompt was False — last message was assistant text/tool_calls,
+no `<|im_start|>assistant\\n` suffix) are SKIPPED: there is no canonical
+"next token" to predict there.
+
+Usage:
+    python run_logits.py <hf_model_dir> <modelfile_path> <output_json>
+                         --transformers-output <transformers.json>
+                         [--model-name NAME]
+                         [--ollama-url URL]
+                         [--top-k K]
+                         [--device cuda|cpu]
+                         [--dtype fp16|fp32|bf16]
+"""
+
+import argparse
+import json
+import math
+import sys
+import traceback
+from pathlib import Path
+
+try:
+    import requests  # noqa: F401  (imported via run_ollama as well, but be explicit)
+except ImportError:
+    sys.exit("ERROR: this script needs the 'requests' package (pip install requests).")
+
+try:
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+except ImportError as e:
+    sys.exit(f"ERROR: this script needs torch + transformers ({e}).")
+
+sys.path.insert(0, str(Path(__file__).parent))
+from run_ollama import check_ollama_alive, ollama_create, ollama_delete, _post  # noqa: E402
+
+
+def transformers_topk(model, tokenizer, prompt, top_k):
+    """Run one forward pass on `prompt` and return the last position's top_k entries as dicts with token_id, token, logprob."""
+    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+    device = next(model.parameters()).device  # run on whatever device the model's parameters live on
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        out = model(**inputs)
+    logits = out.logits[0, -1].float()  # first batch row, last position; cast to float32 before the softmax
+    logprobs = torch.log_softmax(logits, dim=-1)
+    vals, idxs = torch.topk(logprobs, top_k)
+    return [
+        {
+            "token_id": int(i),
+            "token": tokenizer.convert_ids_to_tokens(int(i)),
+            "logprob": float(lp),
+        }
+        for i, lp in zip(idxs.tolist(), vals.tolist())
+    ]
+
+
+def ollama_topk(url, model_name, prompt, top_k):
+    """Request one token from /api/generate (raw=true, so no chat template is applied) and return its top_logprobs entries, or None if the response has no logprobs."""
+    payload = {
+        "model": model_name,
+        "prompt": prompt,
+        "raw": True,
+        "stream": False,
+        "options": {"num_predict": 1, "temperature": 0, "seed": 0},
+        "logprobs": True,
+        "top_logprobs": top_k,
+    }
+    resp = _post(f"{url}/api/generate", payload)
+    lps = resp.get("logprobs") or []  # a missing or null "logprobs" field is treated as empty
+    if not lps:
+        return None
+    first = lps[0]  # only the first generated position is used
+    return [
+        {"token": t["token"], "logprob": t["logprob"], "bytes": t.get("bytes")}
+        for t in first.get("top_logprobs", [])
+    ]
+
+
+_SPM_SPACE = "▁"  # ▁ — SentencePiece's word-boundary marker
+
+
+def ollama_token_to_id(tokenizer, ol_entry, vocab):
+    """Map an Ollama-reported token to the transformers vocab ID (None if it cannot be mapped).
+
+    Critical: we look the token up DIRECTLY in the vocab dict, not via
+    tokenizer.encode(). The encoder normalizes (e.g. always converts a
+    leading literal-space `' Bonjour'` to `▁Bonjour`), which would COLLIDE
+    distinct GGUF vocab entries (`' Bonjour'`/▁Bonjour id=34362 vs
+    `'Bonjour'` id=21327) and cause ol_by_id[id] in compare_topk to hold the
+    logprob of the WRONG token.
+    """
+    s = ol_entry["token"]  # token text as reported by Ollama
+
+    # 1. Try the SentencePiece form: leading literal-space → ▁ prefix.
+    spm_form = (_SPM_SPACE + s[1:]) if s.startswith(" ") else s
+    if spm_form in vocab:
+        return vocab[spm_form]
+
+    # 2. Try the raw string (for non-space-prefixed tokens like 'Bonjour').
+    if s in vocab:
+        return vocab[s]
+
+    # 3. Last resort: lossy re-tokenize. May collide; logged via caller.
+    ids = tokenizer.encode(s, add_special_tokens=False)
+    if len(ids) == 1:  # accept only an unambiguous single-token encoding
+        return ids[0]
+    return None
+
+
+def compare_topk(tf_top, ol_top, tokenizer):
+    """Compare next-token top-K distributions; return a dict of per-case metrics, or None if either list is empty/None."""
+    if not tf_top or not ol_top:
+        return None
+
+    # Annotate Ollama entries with transformers vocab IDs (direct vocab lookup).
+    vocab = tokenizer.get_vocab()
+    ol_with_ids = [
+        {**t, "token_id": ollama_token_to_id(tokenizer, t, vocab)} for t in ol_top
+    ]
+
+    tf1 = tf_top[0]
+    ol1 = ol_with_ids[0]
+    top1_match = tf1["token_id"] == ol1["token_id"]  # False when Ollama's top-1 could not be mapped (None)
+
+    tf_top5_ids = {t["token_id"] for t in tf_top[:5]}
+    ol_top5_ids = {t["token_id"] for t in ol_with_ids[:5] if t["token_id"] is not None}
+    top5_overlap = len(tf_top5_ids & ol_top5_ids)
+
+    # PRIMARY: absolute logprob differences on TF's top-N tokens (aligned by
+    # token ID via Ollama's top-K): top-1 (concrete), top-3 (aggregate), top-5
+    # (noisier tail, for completeness). Built in reverse rank order so that, if
+    # two Ollama tokens map to one ID, the higher-ranked entry's logprob wins.
+    ol_by_id = {t["token_id"]: t["logprob"] for t in reversed(ol_with_ids) if t["token_id"] is not None}
+
+    def diffs_over(n):  # -> (list of |Δlp| for TF's top-n found in Ollama's list, count not found)
+        d = []
+        miss = 0
+        for t in tf_top[:n]:
+            ol_lp = ol_by_id.get(t["token_id"])
+            if ol_lp is None:
+                miss += 1
+            else:
+                d.append(abs(t["logprob"] - ol_lp))
+        return d, miss
+
+    d3, _ = diffs_over(3)
+    d5, missing5 = diffs_over(5)
+    mean3 = (sum(d3) / len(d3)) if d3 else None  # None when none of TF's top-3 appear in Ollama's list
+    mean5 = (sum(d5) / len(d5)) if d5 else None
+
+    # Top-1 logprob diff specifically (most interpretable; same token assumed).
+    top1_lp_diff = None  # stays None when the two top-1 tokens differ
+    if top1_match:
+        ol_top1_lp = ol_by_id.get(tf1["token_id"])
+        if ol_top1_lp is not None:
+            top1_lp_diff = abs(tf1["logprob"] - ol_top1_lp)
+
+    # SECONDARY METRIC: KL on renormalized common-top-K (kept for reference;
+    # can be inflated when overlap is small).
+    tf_by_id = {t["token_id"]: t["logprob"] for t in tf_top}
+    common = set(tf_by_id) & set(ol_by_id)
+    kl = None
+    if common:
+        tf_p = {i: math.exp(tf_by_id[i]) for i in common}
+        ol_p = {i: math.exp(ol_by_id[i]) for i in common}
+        s_tf = sum(tf_p.values())
+        s_ol = sum(ol_p.values())
+        if s_tf > 0 and s_ol > 0:
+            kl = 0.0
+            for i in common:
+                p = tf_p[i] / s_tf  # probabilities renormalized over the common token set
+                q = ol_p[i] / s_ol
+                if p > 1e-12 and q > 1e-12:  # skip vanishing terms to avoid log of ~0
+                    kl += p * math.log(p / q)
+
+    return {
+        "top1_match": top1_match,
+        "tf_top1": {"id": tf1["token_id"], "tok": tf1["token"], "lp": round(tf1["logprob"], 4)},
+        "ol_top1": {"id": ol1["token_id"], "tok": ol1["token"], "lp": round(ol1["logprob"], 4)},
+        "top1_lp_diff": top1_lp_diff,
+        "top5_overlap": top5_overlap,
+        "tf_top5_missing_in_ollama_topk": missing5,
+        "mean_lp_diff_top3": mean3,
+        "mean_lp_diff_top5": mean5,
+        "kl_div_renorm": kl,
+    }
+
+
+def main():  # CLI entry point: compare TF vs Ollama next-token top-K per rendered prompt and write the JSON report
+    parser = argparse.ArgumentParser()
+    parser.add_argument("hf_model_dir", help="Path to HuggingFace transformers model directory")
+    parser.add_argument("modelfile_path", help="Path to the GGUF Modelfile (for ollama create)")
+    parser.add_argument("output_json")
+    parser.add_argument("--transformers-output", required=True,
+                        help="Path to transformers.json (provides the rendered prompts)")
+    parser.add_argument("--model-name", default="test-chat-template-tmp")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--top-k", type=int, default=20)
+    parser.add_argument("--device",
+                        default="cuda" if torch.cuda.is_available() else "cpu",
+                        choices=["cuda", "cpu"])
+    parser.add_argument("--dtype", default=None, choices=["fp16", "fp32", "bf16"],
+                        help="Defaults to fp16 on cuda, fp32 on cpu")
+    args = parser.parse_args()
+
+    version = check_ollama_alive(args.ollama_url)
+    print(f"[logits] Ollama reachable (version {version})")
+
+    tf_path = Path(args.transformers_output)
+    if not tf_path.exists():
+        sys.exit(f"ERROR: transformers output not found at {tf_path}")
+    transformers_data = json.loads(tf_path.read_text())
+
+    if args.dtype is None:
+        args.dtype = "fp16" if args.device == "cuda" else "fp32"
+    torch_dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[args.dtype]
+
+    print(f"[logits] Loading transformers model from {args.hf_model_dir} ({args.device}, {args.dtype})")
+    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.hf_model_dir, torch_dtype=torch_dtype, trust_remote_code=True,
+    ).to(args.device)
+    model.eval()
+
+    ollama_create(args.model_name, args.modelfile_path)
+    try:
+        results = []
+        for entry in transformers_data:
+            name = entry["name"]
+            prompt = entry.get("rendered_prompt")
+            if not prompt:  # entries without a rendered prompt are dropped without a result record
+                continue
+            if entry.get("add_generation_prompt") is False:
+                # No canonical next-token prediction: the conversation ends
+                # on the assistant's own message (closed by <|im_end|>).
+                # Skip — there's nothing meaningful to compare.
+                print(f"[logits]   {name}  SKIP (no add_generation_prompt)")
+                results.append({"name": name, "skipped": "no add_generation_prompt"})
+                continue
+            print(f"[logits]   {name}")
+            try:
+                tf_top = transformers_topk(model, tokenizer, prompt, args.top_k)
+                ol_top = ollama_topk(args.ollama_url, args.model_name, prompt, args.top_k)
+                cmp = compare_topk(tf_top, ol_top, tokenizer)
+                results.append({
+                    "name": name,
+                    "comparison": cmp,
+                    "tf_top5": tf_top[:5],
+                    "ol_top5": (ol_top or [])[:5],
+                })
+                if cmp:
+                    fmt = lambda v: (f"{v:.4f}" if v is not None else "n/a")
+                    print(f"    top1_match={cmp['top1_match']}  "
+                          f"|Δlp_top1|={fmt(cmp['top1_lp_diff'])}  "
+                          f"mean|Δlp|_top3={fmt(cmp['mean_lp_diff_top3'])}  "
+                          f"top5_overlap={cmp['top5_overlap']}/5  "
+                          f"missing={cmp['tf_top5_missing_in_ollama_topk']}/5")
+            except Exception as e:
+                # One failing case is recorded as an error entry; the loop carries on.
+                traceback.print_exc()
+                results.append({"name": name, "error": f"{type(e).__name__}: {e}"})
+    finally:
+        # Always remove the temporary Ollama model, even if the loop raised.
+        ollama_delete(args.model_name)
+        del model
+        if args.device == "cuda":
+            torch.cuda.empty_cache()
+
+    out = Path(args.output_json)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(results, indent=2, ensure_ascii=False))
+    print(f"[logits] Wrote {len(results)} results to {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/run_ollama.py b/test_conversion/run_ollama.py
new file mode 100644
index 00000000000..d7e836c01ea
--- /dev/null
+++ b/test_conversion/run_ollama.py
@@ -0,0 +1,228 @@
+"""
+For each test case, query Ollama and collect the input-token count.
+
+Two probes per case:
+    chat_prompt_eval_count : tokens fed to the model when the conversation
+                             is passed through Ollama's /api/chat (i.e.
+                             Ollama applies its Modelfile TEMPLATE, then
+                             tokenizes with the GGUF tokenizer).
+    raw_prompt_eval_count  : tokens fed when the *transformers-rendered*
+                             prompt is passed through /api/generate with
+                             raw=true (i.e. only the GGUF tokenizer runs;
+                             no chat template applied). This isolates the
+                             tokenizer from the template.
+
+The Ollama model is created from the Modelfile at startup and deleted at exit.
+
+Usage:
+    python run_ollama.py <modelfile_path> <output_json>
+                         [--model-name NAME]
+                         [--transformers-output JSON]
+                         [--ollama-url URL]
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+import traceback
+from pathlib import Path
+
+try:
+    import requests
+except ImportError:
+    sys.exit("ERROR: this script needs the 'requests' package (pip install requests).")
+
+sys.path.insert(0, str(Path(__file__).parent))
+from test_cases import TEST_CASES  # noqa: E402
+
+
+def check_ollama_alive(url):  # returns the server's reported version ("?" if absent); exits the process when unreachable
+    try:
+        r = requests.get(f"{url}/api/version", timeout=3)
+        r.raise_for_status()
+        return r.json().get("version", "?")
+    except Exception as e:
+        # Any failure (connection, HTTP status, bad JSON) ends the run with a hint.
+        sys.exit(
+            f"ERROR: cannot reach Ollama at {url} ({e}).\n"
+            "       Start it first with:  ollama serve"
+        )
+
+
+def validate_from_target(modelfile_path):
+    """Exit with an error unless the Modelfile has a FROM line whose relative target exists; returns None on success."""
+    modelfile_path = Path(modelfile_path).resolve()
+    for line in modelfile_path.read_text().splitlines():
+        line = line.strip()
+        if line[:5].upper() == "FROM ":  # Modelfile instructions are case-insensitive, so accept "from ..." too
+            target = line[len("FROM "):].strip().strip('"')
+            # Ollama requires a relative path for local FROM files; absolute
+            # paths trigger a misleading "no Modelfile or safetensors files
+            # found" error.
+            if target.startswith("/"):
+                sys.exit(
+                    f"ERROR: Modelfile {modelfile_path} has an absolute FROM path "
+                    f"({target}). Ollama needs it relative to the Modelfile."
+                )
+            resolved = (modelfile_path.parent / target).resolve()
+            if not resolved.exists():
+                sys.exit(
+                    f"ERROR: Modelfile {modelfile_path} references\n"
+                    f"  FROM {target}\n"
+                    f"but {resolved} does not exist.\n"
+                    f"Either rename the GGUF or edit the FROM line in the Modelfile.\n"
+                    f"(NB: ollama reports this as 'invalid model name' — misleading.)"
+                )
+            return  # only the first FROM line is checked
+    sys.exit(f"ERROR: no FROM line found in {modelfile_path}")
+
+
+def ollama_create(name, modelfile_path):
+    """Create (or overwrite) an ollama model from the given Modelfile; exits the process if validation or 'ollama create' fails."""
+    modelfile_path = Path(modelfile_path).resolve()
+    validate_from_target(modelfile_path)
+    print(f"[ollama] Creating model '{name}' from {modelfile_path}")
+    # Run from the modelfile's directory so its FROM ./X.gguf resolves.
+    result = subprocess.run(
+        ["ollama", "create", name, "-f", modelfile_path.name],
+        cwd=str(modelfile_path.parent),
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        sys.exit(
+            "ERROR: 'ollama create' failed:\n"
+            f"  STDOUT: {result.stdout}\n"
+            f"  STDERR: {result.stderr}"
+        )
+
+
+def ollama_delete(name):  # best-effort cleanup: the exit status of 'ollama rm' is not checked
+    print(f"[ollama] Removing temporary model '{name}'")
+    subprocess.run(["ollama", "rm", name], capture_output=True, text=True)
+
+
+def normalize_for_ollama(messages):
+    """Convert OpenAI-style messages to the variant Ollama's /api/chat accepts.
+
+    Differences observed empirically:
+      - tool_calls[].function.arguments must be an OBJECT, not a JSON string.
+      - The OpenAI-style {"type": "function", "function": {...}} wrapper around
+        each tool_call is tolerated, but we strip it to be safe.
+    """
+    out = []
+    for msg in messages:
+        m = dict(msg)  # shallow copy, so the caller's message dict is not modified
+        tcs = m.get("tool_calls")
+        if tcs:
+            new_tcs = []
+            for tc in tcs:
+                fn = tc.get("function", tc)  # accept both the wrapped and the bare {"name", "arguments"} form
+                args = fn.get("arguments")
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args)
+                    except json.JSONDecodeError:
+                        pass  # leave it; ollama will complain again
+                new_tcs.append({"function": {"name": fn["name"], "arguments": args}})
+            m["tool_calls"] = new_tcs
+        out.append(m)
+    return out
+
+
+def _post(url, payload):  # POST `payload` as JSON and return the decoded JSON response; RuntimeError on HTTP >= 400
+    r = requests.post(url, json=payload, timeout=300)
+    if r.status_code >= 400:
+        # Surface Ollama's actual complaint instead of a bare HTTPError.
+        raise RuntimeError(f"HTTP {r.status_code} from {url}: {r.text}")
+    return r.json()
+
+
+def ollama_chat(url, model, messages, tools):  # one non-streaming /api/chat call limited to a single generated token
+    payload = {
+        "model": model,
+        "messages": normalize_for_ollama(messages),
+        "stream": False,
+        "options": {"num_predict": 1, "temperature": 0, "seed": 0},
+    }
+    if tools is not None:  # the "tools" key is omitted entirely when no tools are given
+        payload["tools"] = tools
+    return _post(f"{url}/api/chat", payload)
+
+
+def ollama_generate_raw(url, model, prompt, num_predict=1):  # num_predict: max tokens to generate (default 1 keeps the previous behaviour)
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "raw": True,  # send the prompt as-is: no chat template is applied
+        "stream": False,
+        "options": {"num_predict": num_predict, "temperature": 0, "seed": 0},
+    }
+    return _post(f"{url}/api/generate", payload)
+
+
+def main():  # CLI entry point: collect prompt_eval_count per test case (chat probe, plus raw probe when references exist)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("modelfile_path", help="Path to the Modelfile")
+    parser.add_argument("output_json", help="Path to write the JSON results")
+    parser.add_argument("--model-name", default="test-chat-template-tmp",
+                        help="Temporary ollama model name (will be created and removed). "
+                             "Must match ollama's naming rules: lowercase letters, digits, "
+                             "hyphens and periods only (no underscores).")
+    parser.add_argument("--transformers-output", default=None,
+                        help="Path to the transformers.json (enables the raw tokenizer probe)")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    args = parser.parse_args()
+
+    version = check_ollama_alive(args.ollama_url)
+    print(f"[ollama] Server reachable (version {version})")
+
+    transformers_by_name = {}  # case name -> transformers reference entry; stays empty without a reference file
+    if args.transformers_output and Path(args.transformers_output).exists():
+        ref = json.loads(Path(args.transformers_output).read_text())
+        transformers_by_name = {r["name"]: r for r in ref}
+        print(f"[ollama] Loaded {len(transformers_by_name)} transformers references "
+              f"(will probe tokenizer with raw=true)")
+    else:
+        print("[ollama] No transformers reference available; skipping raw-tokenizer probe")
+
+    ollama_create(args.model_name, args.modelfile_path)
+
+    results = []
+    try:
+        for case in TEST_CASES:
+            print(f"[ollama]   {case['name']}")
+            entry = {"name": case["name"]}
+            try:
+                chat_resp = ollama_chat(
+                    args.ollama_url, args.model_name,
+                    case["messages"], case.get("tools"),
+                )
+                entry["chat_prompt_eval_count"] = chat_resp.get("prompt_eval_count")
+            except Exception as e:
+                # A failed chat probe is recorded; the raw probe below still runs.
+                traceback.print_exc()
+                entry["chat_error"] = f"{type(e).__name__}: {e}"
+
+            ref = transformers_by_name.get(case["name"])
+            if ref and "rendered_prompt" in ref:
+                try:
+                    raw_resp = ollama_generate_raw(
+                        args.ollama_url, args.model_name, ref["rendered_prompt"],
+                    )
+                    entry["raw_prompt_eval_count"] = raw_resp.get("prompt_eval_count")
+                except Exception as e:
+                    traceback.print_exc()
+                    entry["raw_error"] = f"{type(e).__name__}: {e}"
+
+            results.append(entry)
+    finally:
+        # Always remove the temporary model, even if the loop raised.
+        ollama_delete(args.model_name)
+
+    out = Path(args.output_json)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(results, indent=2, ensure_ascii=False))
+    print(f"[ollama] Wrote {len(results)} results to {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/run_transformers.py b/test_conversion/run_transformers.py
new file mode 100644
index 00000000000..f98c2971088
--- /dev/null
+++ b/test_conversion/run_transformers.py
@@ -0,0 +1,81 @@
+"""
+Render each test case with the transformers chat template + tokenize.
+
+Outputs a JSON file with, for each test case:
+    name, messages, tools, add_generation_prompt,
+    rendered_prompt, token_count, token_ids
+
+Usage:
+    python run_transformers.py <hf_model_dir> <output_json>
+"""
+
+import argparse
+import json
+import sys
+import traceback
+from pathlib import Path
+
+from transformers import AutoTokenizer
+
+sys.path.insert(0, str(Path(__file__).parent))
+from test_cases import TEST_CASES  # noqa: E402
+
+
+def render_case(tokenizer, case):  # render one test case with the chat template; returns the result dict written to the JSON output
+    messages = case["messages"]
+    tools = case.get("tools")
+
+    # If the conversation ends on an assistant turn, we are NOT prompting for
+    # another generation; otherwise we are (mirrors Ollama's behaviour).
+    add_generation_prompt = messages[-1]["role"] != "assistant"
+
+    kwargs = {"add_generation_prompt": add_generation_prompt}
+    if tools is not None:  # "tools" is only passed to the template when the case defines some
+        kwargs["tools"] = tools
+
+    rendered = tokenizer.apply_chat_template(messages, tokenize=False, **kwargs)
+    token_ids = tokenizer.apply_chat_template(messages, tokenize=True, **kwargs)
+
+    # apply_chat_template may return a tensor; normalize to list[int]
+    if hasattr(token_ids, "tolist"):
+        token_ids = token_ids.tolist()
+    if token_ids and isinstance(token_ids[0], list):  # unwrap a batch of one
+        token_ids = token_ids[0]
+
+    return {
+        "name": case["name"],
+        "messages": messages,
+        "tools": tools,
+        "add_generation_prompt": add_generation_prompt,
+        "rendered_prompt": rendered,
+        "token_count": len(token_ids),
+        "token_ids": token_ids,
+    }
+
+
+def main():  # CLI entry point: render every TEST_CASES entry and write the results as JSON
+    parser = argparse.ArgumentParser()
+    parser.add_argument("hf_model_dir", help="Path to HuggingFace transformers model directory")
+    parser.add_argument("output_json", help="Path to write the JSON results")
+    args = parser.parse_args()
+
+    print(f"[transformers] Loading tokenizer from {args.hf_model_dir}")
+    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir, trust_remote_code=True)
+
+    results = []
+    for case in TEST_CASES:
+        print(f"[transformers]   {case['name']}")
+        try:
+            results.append(render_case(tokenizer, case))
+        except Exception as e:
+            # A failing case becomes a {"name", "error"} entry with no rendered_prompt.
+            traceback.print_exc()
+            results.append({"name": case["name"], "error": f"{type(e).__name__}: {e}"})
+
+    out = Path(args.output_json)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(results, indent=2, ensure_ascii=False))
+    print(f"[transformers] Wrote {len(results)} results to {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_conversion/test_cases.py b/test_conversion/test_cases.py
new file mode 100644
index 00000000000..48b458ebf09
--- /dev/null
+++ b/test_conversion/test_cases.py
@@ -0,0 +1,253 @@
+"""
+Test conversations for comparing the transformers chat template (jinja) with the Ollama Modelfile template.
+
+Each test case is a dict with:
+    name              : unique short identifier (used in filenames and reports)
+    messages          : OpenAI-style list of message dicts
+    tools             : list of OpenAI-style tool definitions, or None
+    expected_behavior : optional; the tool call the model is expected to emit (cases 04 and 05)
+"""
+
+WEATHER_TOOL = {  # OpenAI-style function tool definition: get_weather(location[, unit])
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get the current weather in a given location.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "City name, e.g. 'Paris'.",
+                },
+                "unit": {
+                    "type": "string",
+                    "enum": ["celsius", "fahrenheit"],
+                    "description": "Temperature unit.",
+                },
+            },
+            "required": ["location"],  # "unit" is optional
+        },
+    },
+}
+
+CALCULATOR_TOOL = {  # OpenAI-style function tool definition: calculator(expression)
+    "type": "function",
+    "function": {
+        "name": "calculator",
+        "description": "Evaluate a math expression.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "expression": {
+                    "type": "string",
+                    "description": "A math expression, e.g. '2 + 2 * 3'.",
+                },
+            },
+            "required": ["expression"],  # the only parameter, and it is mandatory
+        },
+    },
+}
+
+
+TEST_CASES = [  # names carry a two-digit prefix, so list order and name order agree
+    # --- No tools ---
+    {
+        "name": "01_user_only",
+        "messages": [
+            {"role": "user", "content": "Bonjour, comment vas-tu ?"},
+        ],
+        "tools": None,
+    },
+    {
+        "name": "02_system_user",
+        "messages": [
+            {"role": "system", "content": "Tu es un assistant qui répond en français."},
+            {"role": "user", "content": "Quelle est la capitale de la France ?"},
+        ],
+        "tools": None,
+    },
+    {
+        "name": "03_multi_turn",
+        "messages": [
+            {"role": "user", "content": "Hi!"},
+            {"role": "assistant", "content": "Hello! How can I help you today?"},
+            {"role": "user", "content": "What's 2 + 2?"},
+            {"role": "assistant", "content": "2 + 2 equals 4."},
+            {"role": "user", "content": "Thanks!"},
+        ],
+        "tools": None,
+    },
+    # --- Tools, no tool call yet (these two cases also carry "expected_behavior") ---
+    {
+        "name": "04_tools_available_no_call",
+        "messages": [
+            {"role": "user", "content": "What's the weather in Paris?"},
+        ],
+        "tools": [WEATHER_TOOL],
+        # Behavioural expectation: the model should emit a tool_call rather than text.
+        "expected_behavior": {
+            "tool_call": {
+                "name": "get_weather",
+                "required_args": ["location"],
+                "args_must_contain": {"location": "paris"},  # case-insensitive substring
+            },
+        },
+    },
+    {
+        "name": "05_tools_with_system",
+        "messages": [
+            {"role": "system", "content": "You are a weather assistant."},
+            {"role": "user", "content": "Weather in Paris please."},
+        ],
+        "tools": [WEATHER_TOOL, CALCULATOR_TOOL],
+        "expected_behavior": {  # same expectation as case 04, with an extra (calculator) tool on offer
+            "tool_call": {
+                "name": "get_weather",
+                "required_args": ["location"],
+                "args_must_contain": {"location": "paris"},
+            },
+        },
+    },
+    # --- Tool call + tool response (tool calls carry no "id"; tool messages no "tool_call_id") ---
+    {
+        "name": "06_single_tool_call_and_response",
+        "messages": [
+            {"role": "user", "content": "What's the weather in Paris?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "Paris"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"temperature": 18, "unit": "celsius"}'},
+            {"role": "assistant", "content": "It's 18°C in Paris."},
+        ],
+        "tools": [WEATHER_TOOL],
+    },
+    # --- Multiple tool calls in one assistant turn (conversation ends on the assistant turn) ---
+    {
+        "name": "07_multiple_tool_calls",
+        "messages": [
+            {"role": "user", "content": "Weather in Paris and London?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "Paris"}',
+                        },
+                    },
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "London"}',
+                        },
+                    },
+                ],
+            },
+        ],
+        "tools": [WEATHER_TOOL],
+    },
+    # --- Consecutive tool responses (must batch into one user turn in jinja) ---
+    {
+        "name": "08_consecutive_tool_responses",
+        "messages": [
+            {"role": "user", "content": "Weather in Paris and London?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "Paris"}',
+                        },
+                    },
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "London"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"location": "Paris", "temperature": 18}'},
+            {"role": "tool", "content": '{"location": "London", "temperature": 15}'},
+        ],
+        "tools": [WEATHER_TOOL],
+    },
+    # --- Assistant with BOTH content AND tool_calls ---
+    {
+        "name": "09_assistant_content_and_tool_call",
+        "messages": [
+            {"role": "user", "content": "What's the weather in Paris?"},
+            {
+                "role": "assistant",
+                "content": "Let me check that for you.",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "Paris"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"temperature": 18, "unit": "celsius"}'},
+        ],
+        "tools": [WEATHER_TOOL],
+    },
+    # --- Long-ish multi-turn with tools sprinkled in (system prompt, two tools, two call/response rounds) ---
+    {
+        "name": "10_full_tool_dialogue",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant with tools."},
+            {"role": "user", "content": "Compute 12*34 then tell me the weather in Lyon."},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "calculator",
+                            "arguments": '{"expression": "12*34"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"result": 408}'},
+            {
+                "role": "assistant",
+                "content": "12*34 = 408. Now checking the weather.",
+                "tool_calls": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "Lyon"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"temperature": 21, "unit": "celsius"}'},
+            {"role": "assistant", "content": "12*34 = 408 and it's 21°C in Lyon."},
+        ],
+        "tools": [WEATHER_TOOL, CALCULATOR_TOOL],
+    },
+]
diff --git a/test_conversion/test_main.py b/test_conversion/test_main.py
new file mode 100644
index 00000000000..75505c641b1
--- /dev/null
+++ b/test_conversion/test_main.py
@@ -0,0 +1,175 @@
+"""
+Main orchestrator: compare the chat template + tokenizer of a transformers
+model against an Ollama (GGUF + Modelfile) deployment.
+
+Pipeline:
+    1. run_transformers.py  ->  <work_dir>/transformers.json
+    2. run_ollama.py        ->  <work_dir>/ollama.json
+    3. run_behavior.py      ->  <work_dir>/behavior.json   (only cases with expected_behavior)
+    4. run_logits.py        ->  <work_dir>/logits.json     (per-case next-token logit comparison)
+    5. compare.py           ->  prints per-test report, exit 1 on failure
+
+Each step is skipped if its output JSON already exists; delete the file (or
+the whole <work_dir>) to force recomputation. Or pass --force. Slow optional
+steps can be turned off with --no-behavior and --no-logits.
+
+Requirements:
+    - transformers + torch    (Python; transformers always; torch only for the logits step, see --no-logits)
+    - requests                (Python)
+    - ollama                  (must be running:  ollama serve)
+
+Usage:
+    python test_main.py <hf_model_dir> <gguf_dir>
+                        [--work-dir DIR]
+                        [--ollama-model-name NAME]
+                        [--ollama-url URL]
+                        [--num-predict N]
+                        [--logits-top-k K]
+                        [--logits-device cuda|cpu]
+                        [--no-behavior]
+                        [--no-logits]
+                        [--force]
+
+Where:
+    <hf_model_dir>  is a HuggingFace transformers model directory
+                    (must contain tokenizer files + chat_template).
+    <gguf_dir>      is a directory containing both:
+                      - a 'Modelfile' file
+                      - the .gguf file referenced by the Modelfile (FROM ./...)
+"""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_step(label, cmd, output_file, force):
+    """Run `cmd` as a subprocess unless a cached `output_file` exists (and not `force`); exit on failure."""
+    if force or not output_file.exists():
+        print(f"=== {label} ===", "$ " + " ".join(cmd), sep="\n")
+        exit_code = subprocess.run(cmd).returncode
+        if exit_code:
+            sys.exit(f"!!! {label} failed (exit {exit_code})")
+        print()
+    else:
+        print(f"=== {label}: SKIP (using cached {output_file}) ===\n")
+
+
+def main():
+    """Parse the CLI, run steps 1-4 via run_step (3 and 4 unless disabled), then run compare.py and exit with its code."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("hf_model_dir", help="Path to the HuggingFace transformers model directory")
+    parser.add_argument("gguf_dir", help="Directory containing the Modelfile and the .gguf file")
+    parser.add_argument("--work-dir", default=None,
+                        help="Where to store intermediate JSON files "
+                             "(default: ./results/<hf_basename>__vs__<gguf_basename>/)")
+    parser.add_argument("--ollama-model-name", default="test-chat-template-tmp",
+                        help="Temporary ollama model name (created and removed by the script). "
+                             "No underscores: ollama rejects them.")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--num-predict", type=int, default=256,
+                        help="Max tokens generated per behavioural case (default 256)")
+    parser.add_argument("--logits-top-k", type=int, default=20,
+                        help="K for next-token top-K logit comparison (default 20)")
+    parser.add_argument("--logits-device", default=None, choices=["cuda", "cpu"],
+                        help="Device for the transformers forward pass (default: cuda if available, else cpu)")
+    parser.add_argument("--no-behavior", action="store_true",
+                        help="Skip the behavioural step (slower; requires the model to actually generate)")
+    parser.add_argument("--no-logits", action="store_true",
+                        help="Skip the logit-comparison step (loads the full transformers model; slow)")
+    parser.add_argument("--force", action="store_true",
+                        help="Recompute all intermediate outputs, ignoring caches")
+    args = parser.parse_args()
+
+    here = Path(__file__).resolve().parent  # directory of this script; step scripts are resolved relative to it
+    hf_dir = Path(args.hf_model_dir).resolve()
+    gguf_dir = Path(args.gguf_dir).resolve()
+    modelfile = gguf_dir / "Modelfile"
+    if not hf_dir.is_dir():
+        sys.exit(f"ERROR: HF model dir not found: {hf_dir}")
+    if not modelfile.is_file():
+        sys.exit(f"ERROR: Modelfile not found at {modelfile}")
+
+    work_dir = (Path(args.work_dir).resolve()
+                if args.work_dir
+                else here / "results" / f"{hf_dir.name}__vs__{gguf_dir.name}")
+    work_dir.mkdir(parents=True, exist_ok=True)
+    transformers_json = work_dir / "transformers.json"  # one output file per step; run_step uses them as caches
+    ollama_json = work_dir / "ollama.json"
+    behavior_json = work_dir / "behavior.json"
+    logits_json = work_dir / "logits.json"
+
+    print(f"HF model dir   : {hf_dir}")
+    print(f"GGUF dir       : {gguf_dir}")
+    print(f"Modelfile      : {modelfile}")
+    print(f"Work dir       : {work_dir}")
+    print(f"Ollama URL     : {args.ollama_url}")
+    print()
+
+    # Step 1: transformers (run_transformers.py writes transformers.json)
+    run_step(
+        "Step 1/5 — transformers (render + tokenize)",
+        [sys.executable, str(here / "run_transformers.py"), str(hf_dir), str(transformers_json)],
+        transformers_json,
+        args.force,
+    )
+
+    # Step 2: ollama (depends on Step 1's JSON for the raw-tokenizer probe)
+    run_step(
+        "Step 2/5 — ollama (chat + raw tokenizer probes)",
+        [sys.executable, str(here / "run_ollama.py"), str(modelfile), str(ollama_json),
+         "--model-name", args.ollama_model_name,
+         "--transformers-output", str(transformers_json),
+         "--ollama-url", args.ollama_url],
+        ollama_json,
+        args.force,
+    )
+
+    # Step 3: behavioural check (optional). The model actually generates here.
+    if args.no_behavior:
+        print("=== Step 3/5 — behavioural check: SKIPPED (--no-behavior) ===\n")
+    else:
+        run_step(
+            "Step 3/5 — behavioural check (model generates tool_calls)",
+            [sys.executable, str(here / "run_behavior.py"), str(modelfile), str(behavior_json),
+             "--transformers-output", str(transformers_json),
+             "--model-name", args.ollama_model_name,
+             "--ollama-url", args.ollama_url,
+             "--num-predict", str(args.num_predict)],
+            behavior_json,
+            args.force,
+        )
+
+    # Step 4: logit comparison (optional, slow — loads the full transformers model).
+    if args.no_logits:
+        print("=== Step 4/5 — logit comparison: SKIPPED (--no-logits) ===\n")
+    else:
+        logits_cmd = [sys.executable, str(here / "run_logits.py"),
+                      str(hf_dir), str(modelfile), str(logits_json),
+                      "--transformers-output", str(transformers_json),
+                      "--model-name", args.ollama_model_name,
+                      "--ollama-url", args.ollama_url,
+                      "--top-k", str(args.logits_top_k)]
+        if args.logits_device:  # omit --device when unset so run_logits.py applies its own default
+            logits_cmd += ["--device", args.logits_device]
+        run_step(
+            "Step 4/5 — logit comparison (transformers vs Ollama, next-token top-K)",
+            logits_cmd,
+            logits_json,
+            args.force,
+        )
+
+    # Step 5: compare (always runs, never cached); optional inputs are passed only if enabled and present on disk
+    print("=== Step 5/5 — compare ===")
+    compare_cmd = [sys.executable, str(here / "compare.py"), str(transformers_json), str(ollama_json)]
+    if not args.no_behavior and behavior_json.exists():
+        compare_cmd += ["--behavior", str(behavior_json)]
+    if not args.no_logits and logits_json.exists():
+        compare_cmd += ["--logits", str(logits_json)]
+    rc = subprocess.run(compare_cmd).returncode
+    sys.exit(rc)  # propagate compare.py's exit status as this script's own
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 93e697607e6..374ae159c2e 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -30,7 +30,7 @@ class LibLlama:
 
     DEFAULT_PATH_LLAMA_H = "./include/llama.h"
     DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
-    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
+    DEFAULT_PATH_LIBLLAMA = "./build/bin/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
 
     def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
         path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
@@ -79,6 +79,9 @@ def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
         self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams)
         if not self.model:
             raise RuntimeError("error: failed to load model '%s'" % path_model)
+        self.vocab = self.lib.llama_model_get_vocab(self.model)
+        if not self.vocab:
+            raise RuntimeError("error: failed to get vocab for model '%s'" % path_model)
         if isinstance(cparams, dict):
             cparams = libllama.context_default_params(**cparams)
         self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
@@ -99,10 +102,10 @@ def free(self):
 
     def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
         encoded_text: bytes = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
         while num < 0 and len(self.token_ids) < (16 << 20):
             self.token_ids = self.ffi.new("llama_token[]", -2 * num)
-            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+            num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
         return list(self.token_ids[0:num])
 
     def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
@@ -110,10 +113,10 @@ def detokenize(self, ids: list[int], remove_special: bool = False, unparse_speci
             self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
         for i, id in enumerate(ids):
             self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         while num < 0 and len(self.text_buff) < (16 << 20):
             self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+            num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
 
 
@@ -152,6 +155,9 @@ def encode(self, text: str) -> list[int]:
 
     def decode(self, ids: list[int]) -> str:
         return self.model.decode(ids, skip_special_tokens=False)
+    
+    def convert_ids_to_tokens(self, ids: list[int]) -> list[str]:
+        return self.model.convert_ids_to_tokens(ids)
 
 
 class TokenizerLlamaCpp (Tokenizer):
@@ -204,6 +210,12 @@ def generator_custom_text() -> Iterator[str]:
         "\n =",
         "' era",
         "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+    ]
+
+
+def generator_digit() -> Iterator[str]:
+    """Digits"""
+    yield from [
         "3",
         "33",
         "333",
@@ -213,6 +225,20 @@ def generator_custom_text() -> Iterator[str]:
         "3333333",
         "33333333",
         "333333333",
+        "333333333+333",
+    ]
+
+
+def generator_contractions() -> Iterator[str]:
+    """Contractions and apostrophes"""
+    yield from [
+        "I'll",
+        "We've they're",
+        "Bonjour quoiqu'aujourd'hui",
+        "puisqu'après",
+        "j're",
+        "“Bonjour quoiqu'aujourd'hui”",
+        "puisqu’après",
     ]
 
 
@@ -418,7 +444,7 @@ def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
         return min(len(ids1), len(ids2))
 
     def check_detokenizer(text: str, text1: str, text2: str) -> bool:
-        if text1 == text2:  # equal to TokenizerGroundtruth?
+        if text1 == text2 or text2 == text:  # equal to TokenizerGroundtruth?
             return True
         # equal to source text?
         if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str):  # remove BOS
@@ -436,7 +462,7 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
     t_start = time.perf_counter()
     encode_errors = 0
     decode_errors = 0
-    MAX_ERRORS = 10
+    MAX_ERRORS = 20
 
     logger.info("%s: %s" % (generator.__qualname__, "ini"))
     for text in generator:
@@ -455,23 +481,30 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
         t_encode2 += t2 - t1
         t_decode1 += t3 - t2
         t_decode2 += t4 - t3
-        if encode_errors < MAX_ERRORS and ids1 != ids2:
+        had_error = False
+        if (MAX_ERRORS is None or encode_errors < MAX_ERRORS) and ids1 != ids2:
             i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error(" Expected: " + str(ids1))
-            logger.error("   Result: " + str(ids2))
+            ids1_ctx = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2_ctx = list(ids2)[max(0, i - 2) : i + 5 + 1]
+            logger.error(f"  Input: {repr(text[:100])}")
+            logger.error(" Expected: " + str(ids1_ctx) + "  " + str(tokenizer1.convert_ids_to_tokens(ids1_ctx)))
+            logger.error("   Result: " + str(ids2_ctx) + "  " + str(tokenizer1.convert_ids_to_tokens(ids2_ctx)))
             encode_errors += 1
-            logger.error(f" {encode_errors=}")
-        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
+            # logger.error(f" {encode_errors=}")
+            had_error = True
+        if (MAX_ERRORS is None or decode_errors < MAX_ERRORS) and not check_detokenizer(text, text1, text2):
             i = find_first_mismatch(text1, text2)
-            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
-            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
-            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
-            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
+            text1_ctx = text1[max(0, i - 2) : i + 5 + 1]
+            text2_ctx = text2[max(0, i - 2) : i + 5 + 1]
+            logger.error(f"  Input: {repr(text[:100])}")
+            logger.error(" Expected: " + repr(text1_ctx))
+            logger.error("   Result: " + repr(text2_ctx))
             decode_errors += 1
-            logger.error(f" {decode_errors=}")
-        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
+            # logger.error(f" {decode_errors=}")
+            had_error = True
+        if had_error:
+            logger.error("")
+        if MAX_ERRORS is not None and encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
             logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
             # raise Exception()
             break
@@ -493,74 +526,76 @@ def main(argv: list[str] | None = None):
     tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
     tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
 
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_digit())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_contractions())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
     compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
     compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
     compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
     compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
     compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
 
     tokenizer2.model.free()
 
 
 if __name__ == "__main__":
-    # main()
-
-    if True:
-        logging.basicConfig(
-            level    = logging.DEBUG,
-            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
-            datefmt  = "%Y-%m-%d %H:%M:%S",
-            filename = logger.name + ".log",
-            filemode = "a"
-        )
-    logging.basicConfig(
-        level    = logging.DEBUG,
-        format   = "%(levelname)s %(message)s",
-    )
-
-    path_tokenizers   = Path("./models/tokenizers/")
-    path_vocab_format = "./models/ggml-vocab-%s.gguf"
-
-    tokenizers = [
-        "llama-spm",      # SPM
-        "phi-3",          # SPM
-        "gemma",          # SPM
-        "gemma-2",        # SPM
-        "baichuan",       # SPM
-        "bert-bge",       # WPM
-        "jina-v2-en",     # WPM
-        "llama-bpe",      # BPE
-        "phi-2",          # BPE
-        "deepseek-llm",   # BPE
-        "deepseek-coder", # BPE
-        "falcon",         # BPE
-        "mpt",            # BPE
-        "starcoder",      # BPE
-        "gpt-2",          # BPE
-        "stablelm2",      # BPE
-        "refact",         # BPE
-        "qwen2",          # BPE
-        "olmo",           # BPE
-        "jina-v2-es",     # BPE
-        "jina-v2-de",     # BPE
-        "smaug-bpe",      # BPE
-        "poro-chat",      # BPE
-        "jina-v2-code",   # BPE
-        "viking",         # BPE
-        "jais",           # BPE
-    ]
-
-    logger.info("=" * 50)
-    for tokenizer in tokenizers:
-        logger.info("-" * 50)
-        logger.info(f"TOKENIZER: '{tokenizer}'")
-        vocab_file = Path(path_vocab_format % tokenizer)
-        dir_tokenizer = path_tokenizers / tokenizer
-        main([str(vocab_file), str(dir_tokenizer), "--verbose"])
+    main()
+
+    # if True:
+    #     logging.basicConfig(
+    #         level    = logging.DEBUG,
+    #         format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+    #         datefmt  = "%Y-%m-%d %H:%M:%S",
+    #         filename = logger.name + ".log",
+    #         filemode = "a"
+    #     )
+    # logging.basicConfig(
+    #     level    = logging.DEBUG,
+    #     format   = "%(levelname)s %(message)s",
+    # )
+
+    # path_tokenizers   = Path("./models/tokenizers/")
+    # path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # tokenizers = [
+    #     "llama-spm",      # SPM
+    #     "phi-3",          # SPM
+    #     "gemma",          # SPM
+    #     "gemma-2",        # SPM
+    #     "baichuan",       # SPM
+    #     "bert-bge",       # WPM
+    #     "jina-v2-en",     # WPM
+    #     "llama-bpe",      # BPE
+    #     "phi-2",          # BPE
+    #     "deepseek-llm",   # BPE
+    #     "deepseek-coder", # BPE
+    #     "falcon",         # BPE
+    #     "mpt",            # BPE
+    #     "starcoder",      # BPE
+    #     "gpt-2",          # BPE
+    #     "stablelm2",      # BPE
+    #     "refact",         # BPE
+    #     "qwen2",          # BPE
+    #     "olmo",           # BPE
+    #     "jina-v2-es",     # BPE
+    #     "jina-v2-de",     # BPE
+    #     "smaug-bpe",      # BPE
+    #     "poro-chat",      # BPE
+    #     "jina-v2-code",   # BPE
+    #     "viking",         # BPE
+    #     "jais",           # BPE
+    # ]
+
+    # logger.info("=" * 50)
+    # for tokenizer in tokenizers:
+    #     logger.info("-" * 50)
+    #     logger.info(f"TOKENIZER: '{tokenizer}'")
+    #     vocab_file = Path(path_vocab_format % tokenizer)
+    #     dir_tokenizer = path_tokenizers / tokenizer
+    #     main([str(vocab_file), str(dir_tokenizer), "--verbose"])