From 6d186cb9f91a4ff8a69427675875852e1254a1c1 Mon Sep 17 00:00:00 2001
From: isHuangXin <huangxin.hust@gmail.com>
Date: Thu, 21 May 2026 12:16:23 +0800
Subject: [PATCH 1/3] Add bitnet-embeddings-0.6b model adaptation with F16 and
 I2_S GGUF conversion

- Add GGUF conversion tool for bitnet-embeddings-0.6b (safetensors -> F16/I2_S GGUF)
- Add Qwen3 architecture support in llama.cpp submodule with per-projection RMSNorm
- Add I2_S ternary quantization (2-bit packed -1/0/+1) for lossless precision
- Add f16 norm weight support for correct embedding inference
- Add AVX512BW SIMD paths for I2_S kernel (~2x throughput on AVX512-capable CPUs)
- Guard bitnet-lut-kernels.h include with TL1/TL2 preprocessor checks
- Update llama.cpp submodule to dev-bitnet-embedding-0.6b branch
- Document F16 (from multilingual-e5-0.6b) and I2_S (from bitnet-embeddings-0.6b) conversion process
---
 3rdparty/llama.cpp                            |   2 +-
 ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 +++++++++++
 src/ggml-bitnet-lut.cpp                       |   7 +
 src/ggml-bitnet-mad.cpp                       | 469 +++++++++++++++-
 utils/convert-bitnet-embedding-to-gguf.py     | 502 ++++++++++++++++++
 5 files changed, 1277 insertions(+), 5 deletions(-)
 create mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md
 create mode 100644 utils/convert-bitnet-embedding-to-gguf.py

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 1f86f058d..13e129947 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 1f86f058de0c3f4098dedae2ae8653c335c868a1
+Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15
diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md
new file mode 100644
index 000000000..9d63c9300
--- /dev/null
+++ b/docs/bitnet-embeddings-qwen3-gguf-conversion.md
@@ -0,0 +1,302 @@
+# BitNet Embeddings (Qwen3) GGUF Conversion Implementation
+
+## 1. Background
+
+`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul:
+
+```
+x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
+```
+
+This pattern does **not** exist in any standard llama.cpp architecture:
+- Standard Qwen3: no per-projection norms
+- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
+
+### Model Config
+
+- Architecture: `Qwen3Model`
+- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8
+- head_dim: 128 (note: != hidden_size/num_heads = 64)
+- intermediate_size: 3072, num_hidden_layers: 28
+- tie_word_embeddings: true
+- rope_theta: 1000000, rms_norm_eps: 1e-06
+
+### Per-Layer Tensors (7 extra norm tensors per layer)
+
+| Tensor | Shape |
+|--------|-------|
+| `self_attn.q_proj.norm.weight` | [1024] |
+| `self_attn.k_proj.norm.weight` | [1024] |
+| `self_attn.v_proj.norm.weight` | [1024] |
+| `self_attn.o_proj.norm.weight` | [2048] |
+| `mlp.gate_proj.norm.weight` | [1024] |
+| `mlp.up_proj.norm.weight` | [1024] |
+| `mlp.down_proj.norm.weight` | [3072] |
+
+---
+
+## 2. GGUF Tensor Name Mapping
+
+| HF Name | GGUF Name | Notes |
+|----------|-----------|-------|
+| `embed_tokens.weight` | `token_embd.weight` | |
+| `norm.weight` | `output_norm.weight` | |
+| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
+| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | |
+| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
+| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
+| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
+| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
+| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
+| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
+| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
+| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
+
+---
+
+## 3. Conversion Script
+
+### `utils/convert-bitnet-embedding-to-gguf.py`
+
+Standalone conversion script (safetensors → GGUF). Key features:
+
+- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
+- Supports three output types:
+  - `--outtype f32`: all weights in float32
+  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
+  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
+- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64)
+- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification
+- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
+- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling
+- Architecture string: `"qwen3"`
+
+### I2_S Ternary Packing
+
+The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
+
+- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
+- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
+- Every 128 values form a block, packed into 32 bytes
+- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`
+- Scale (float32) is appended at the end of the packed data buffer
+
+### Tensor Type Assignment
+
+| Tensor Type | f16 mode | i2_s mode |
+|-------------|----------|-----------|
+| 2D linear weights | float16 | I2_S ternary packed |
+| Embedding weights | float16 | float16 |
+| Norm weights (1D) | float16 | float16 |
+
+Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
+
+---
+
+## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
+
+### 4.1 New Tensor Enums
+
+Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`:
+
+```cpp
+LLM_TENSOR_ATTN_Q_NORM_IN,
+LLM_TENSOR_ATTN_K_NORM_IN,
+LLM_TENSOR_ATTN_V_NORM_IN,
+LLM_TENSOR_ATTN_OUT_NORM_IN,
+LLM_TENSOR_FFN_GATE_NORM_IN,
+LLM_TENSOR_FFN_UP_NORM_IN,
+LLM_TENSOR_FFN_DOWN_NORM_IN,
+```
+
+### 4.2 Tensor Name Mappings
+
+Added to `LLM_ARCH_QWEN3` tensor name map:
+
+```cpp
+{ LLM_TENSOR_ATTN_Q_NORM_IN,   "blk.%d.attn_q_norm_in" },
+{ LLM_TENSOR_ATTN_K_NORM_IN,   "blk.%d.attn_k_norm_in" },
+{ LLM_TENSOR_ATTN_V_NORM_IN,   "blk.%d.attn_v_norm_in" },
+{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" },
+{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" },
+{ LLM_TENSOR_FFN_UP_NORM_IN,   "blk.%d.ffn_up_norm_in" },
+{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" },
+```
+
+### 4.3 Layer Struct Fields
+
+Added to `struct llama_layer`:
+
+```cpp
+struct ggml_tensor * attn_q_norm_in;
+struct ggml_tensor * attn_k_norm_in;
+struct ggml_tensor * attn_v_norm_in;
+struct ggml_tensor * attn_out_norm_in;
+struct ggml_tensor * ffn_gate_norm_in;
+struct ggml_tensor * ffn_up_norm_in;
+struct ggml_tensor * ffn_down_norm_in;
+```
+
+### 4.4 load_tensors (LLM_ARCH_QWEN3)
+
+Added optional loading with `TENSOR_NOT_REQUIRED`:
+
+```cpp
+layer.attn_q_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_v_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head},    TENSOR_NOT_REQUIRED);
+layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_up_norm_in   = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff},   TENSOR_NOT_REQUIRED);
+```
+
+Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072).
+
+### 4.5 build_qwen3() Graph Modifications
+
+The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, the behavior is identical to the original.
+
+**Attention per-projection norms:**
+```
+// Before Q/K/V matmul:
+if (layer.attn_q_norm_in) {
+    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
+} else {
+    cur_q = cur;
+}
+Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
+// Similarly for K, V
+```
+
+**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually:
+
+```
+cur = llm_build_kv(..., wo=NULL, ...);  // returns attention output without o_proj
+if (layer.attn_out_norm_in) {
+    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
+}
+cur = ggml_mul_mat(ctx, layer.wo, cur);
+```
+
+**FFN per-projection norms:**
+```
+// Instead of llm_build_ffn(), manually:
+if (layer.ffn_gate_norm_in) {
+    tmp_gate = rms_norm(cur) * gate_norm_in;
+} else {
+    tmp_gate = cur;
+}
+tmp_gate = matmul(gate_proj, tmp_gate);
+// Similarly for up_proj
+tmp = silu(tmp_gate) * tmp_up;
+
+if (layer.ffn_down_norm_in) {
+    tmp = rms_norm(tmp) * down_norm_in;
+}
+cur = matmul(down_proj, tmp);
+```
+
+---
+
+## 5. GGUF Conversion Process
+
+There are two GGUF files to produce, from **two different source models**:
+
+| GGUF Output | Source Model | Description |
+|-------------|-------------|-------------|
+| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights |
+| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights |
+
+### 5.1 F16 GGUF: from multilingual-e5-0.6b
+
+The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model:
+
+```bash
+python3 /path/to/llama.cpp/convert_hf_to_gguf.py \
+  /path/to/multilingual-e5-0.6b \
+  --outtype f16 \
+  --outfile embeddings-0.6b-f16.gguf
+```
+
+**What happens:**
+1. Load `model.safetensors` (standard Qwen3 weights, bfloat16)
+2. Convert all 2D weights (projections, embeddings) to float16
+3. Convert norm weights to float32
+4. Write GGUF with `qwen3` architecture metadata and tokenizer
+
+**Output:** ~1.11 GiB (595.78M params)
+
+### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b
+
+The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization:
+
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/bitnet-embeddings-0.6b \
+  --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s
+```
+
+**What happens:**
+1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
+2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2)
+3. For each 2D linear weight (q/k/v/o/gate/up/down projections):
+   - Compute scale: `scale = 1 / mean(|w|)`
+   - Quantize: `q = round(w * scale).clamp(-1, 1)`
+   - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2`
+   - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each)
+   - Append the float32 scale after the packed data
+4. Keep embeddings (`token_embd.weight`) in float16 (not ternary)
+5. Keep all norm weights in float16
+6. Skip `output.weight` (lm_head, not needed for embedding models)
+7. Write GGUF with `I2_S` type tag for quantized tensors
+
+**Output:** ~699 MiB (~60% of the ~1.11 GiB F16 file)
+
+### 5.3 Why Two Different Source Models?
+
+- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
+- `bitnet-embeddings-0.6b` is the **ternary (1.58-bit) quantized student model** with weights in {-1, 0, +1} and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
+- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
+
+### 5.4 Tensor Type Summary
+
+| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) |
+|--------|---------------------|-------------------------|
+| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
+| Embedding (`token_embd.weight`) | float16 | float16 |
+| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
+| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 |
+| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 |
+| `output.weight` (lm_head) | present | skipped |
+
+---
+
+## 6. Build and Run
+
+```bash
+# Build with BitNet repo (includes I2_S support)
+cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build --target llama-embedding llama-bench -j$(nproc)
+
+# Run embedding inference
+build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -p "hello world" --embd-normalize 2 --embd-output-format array
+
+# Benchmark: F16 vs I2_S
+build/bin/llama-bench -m embeddings-0.6b-f16.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+```
diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp
index 59422d548..beef726f7 100644
--- a/src/ggml-bitnet-lut.cpp
+++ b/src/ggml-bitnet-lut.cpp
@@ -5,9 +5,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#ifdef __x86_64__
+#include <immintrin.h>
+#endif
+
 #include "ggml-bitnet.h"
 #include "ggml-quants.h"
+
+#if defined(GGML_BITNET_ARM_TL1) || defined(GGML_BITNET_X86_TL2)
 #include "bitnet-lut-kernels.h"
+#endif
 
 #if defined(GGML_BITNET_ARM_TL1)
 
diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp
index 4ba9d6509..f99368bbd 100644
--- a/src/ggml-bitnet-mad.cpp
+++ b/src/ggml-bitnet-mad.cpp
@@ -24,6 +24,12 @@ static inline int hsum_i32_8(const __m256i a) {
     const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+// Horizontally add the 16 packed int32_t lanes of `a` and return the sum as an int.
+static inline int hsum_i32_16(const __m512i a) {
+    return _mm512_reduce_add_epi32(a);
+}
+#endif
 #elif defined(__loongarch_asx)
 // horizontally add 8 int32_t
 static inline int hsum_i32_8(const __m256i a) {
@@ -196,7 +202,153 @@ size_t quantize_i2_s(const float * src, void * dst, int64_t nrow, int64_t n_per_
 }
 
 void ggml_vec_dot_i2_i8_s_1x1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+    // Each block pairs 32 packed weight bytes (four 2-bit values per byte) with 128 int8 activations.
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+    const int groupla_num = nb % 32 != 0 ? 1 : 0;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+
+    for (int row = 0; row < nrc; row++) {
+        __m512i accu = _mm512_setzero_si512();
+
+        const uint8_t * x_row = x + row * bx / 4;
+
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t *px = x_row + i * 1024;
+            const int8_t  *py = y + i * 4096;
+            __m512i accu32 = _mm512_setzero_si512();
+
+            // Process 2 blocks per iteration (j+=2), 16 iterations instead of 32
+            int j = 0;
+            for (; j + 1 < 32; j += 2) {
+                // Load 2 consecutive 32-byte weight blocks into one 512-bit register
+                __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                // Low 256 bits: 32-value group g of the first block (py + 32*g); high 256 bits: group g of the second block (py + 128 + 32*g). The cast's undefined upper half is overwritten by the insert.
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))), _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))), _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))), _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))), _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1));
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3));
+
+                px += 64;
+                py += 256;
+            }
+            // Handle odd remaining block (never taken for the fixed count of 32; mirrors the tail group below)
+            if (j < 32) {
+                __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8_3 = _mm512_zextsi256_si512(xq8_3_256);
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_zextsi256_si512(yq8_0_256));
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_zextsi256_si512(yq8_1_256));
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_zextsi256_si512(yq8_2_256));
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_zextsi256_si512(yq8_3_256));
+
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1));
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3));
+            }
+            accu = _mm512_add_epi32(_mm512_madd_epi16(accu32, one16), accu);
+        }
+
+        for (int i = 0; i < groupla_num; i++) {
+            __m512i accula = _mm512_setzero_si512();
+            const uint8_t *px = x_row + group32_num * 1024;
+            const int8_t  *py = y + group32_num * 4096;
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+                // Same per-block pairing as above: low half from the first block, high half from the second block.
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))), _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))), _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))), _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))), _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1));
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3));
+
+                px += 64;
+                py += 256;
+            }
+            if (j < la_num) {
+                __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8_3 = _mm512_zextsi256_si512(xq8_3_256);
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_zextsi256_si512(yq8_0_256));
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_zextsi256_si512(yq8_1_256));
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_zextsi256_si512(yq8_2_256));
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_zextsi256_si512(yq8_3_256));
+
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1));
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3));
+            }
+            accu = _mm512_add_epi32(accu, _mm512_madd_epi16(accula, one16));
+        }
+
+        int sumi = hsum_i32_16(accu);
+        s[row] = (float)sumi;
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -510,7 +662,184 @@ void ggml_vec_dot_i2_i8_s_1x4_32W(int n, float * s, size_t bs, const void * vx,
 }
 
 void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+    // Each block pairs 32 packed weight bytes (four 2-bit values per byte) with 128 int8 activations.
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+    const int groupla_num = nb % 32 != 0 ? 1 : 0;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+
+    for (int row = 0; row < nrc; row += PARALLEL_SIZE) {
+        __m512i accu[PARALLEL_SIZE];
+        const uint8_t * x_row[PARALLEL_SIZE];
+        for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+            accu[rb] = _mm512_setzero_si512();
+            x_row[rb] = x + (row + rb) * bx / 4;
+        }
+
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t * px[PARALLEL_SIZE];
+            __m512i accu32[PARALLEL_SIZE];
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                px[rb] = x_row[rb] + i * 1024;
+                accu32[rb] = _mm512_setzero_si512();
+            }
+            const int8_t  *py = y + i * 4096;
+            // Two blocks per iteration. Low 256 bits: 32-value group g of the first block (py + 32*g); high 256 bits: group g of the second block (py + 128 + 32*g).
+            int j = 0;
+            for (; j + 1 < 32; j += 2) {
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))), _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))), _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))), _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))), _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb]));
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 64;
+                }
+                py += 256;
+            }
+            if (j < 32) {
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+                __m512i yq8_0 = _mm512_zextsi256_si512(yq8_0_256);
+                __m512i yq8_1 = _mm512_zextsi256_si512(yq8_1_256);
+                __m512i yq8_2 = _mm512_zextsi256_si512(yq8_2_256);
+                __m512i yq8_3 = _mm512_zextsi256_si512(yq8_3_256);
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb]));
+                    __m512i xq8_3 = _mm512_zextsi256_si512(xq8_3_256);
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 32;
+                }
+            }
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                accu[rb] = _mm512_add_epi32(_mm512_madd_epi16(accu32[rb], one16), accu[rb]);
+            }
+        }
+
+        for (int i = 0; i < groupla_num; i++) {
+            const int8_t  *py = y + group32_num * 4096;
+            const uint8_t * px[PARALLEL_SIZE];
+            __m512i accula[PARALLEL_SIZE];
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                px[rb] = x_row[rb] + group32_num * 1024;
+                accula[rb] = _mm512_setzero_si512();
+            }
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))), _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))), _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))), _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))), _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+                // The four activation vectors above are shared by all PARALLEL_SIZE weight rows.
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb]));
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 64;
+                }
+                py += 256;
+            }
+            if (j < la_num) {
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+                __m512i yq8_0 = _mm512_zextsi256_si512(yq8_0_256);
+                __m512i yq8_1 = _mm512_zextsi256_si512(yq8_1_256);
+                __m512i yq8_2 = _mm512_zextsi256_si512(yq8_2_256);
+                __m512i yq8_3 = _mm512_zextsi256_si512(yq8_3_256);
+                // Odd trailing block: zero-extend so the unused upper 256 bits contribute nothing to the sum.
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb]));
+                    __m512i xq8_3 = _mm512_zextsi256_si512(xq8_3_256);
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 32;
+                }
+            }
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                accu[rb] = _mm512_add_epi32(accu[rb], _mm512_madd_epi16(accula[rb], one16));
+            }
+        }
+
+        for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+            int sumi = hsum_i32_16(accu[rb]);
+            s[row + rb] = (float)sumi;
+        }
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -789,7 +1118,139 @@ void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size
 }
 
 void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+    const int groupla_num = nb % 32 != 0 ? 1 : 0;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+
+    for (int col = 0; col < nrc; col += PARALLEL_SIZE) {
+        __m512i accu[PARALLEL_SIZE];
+
+        for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+            accu[iy] = _mm512_setzero_si512();
+        }
+
+        const int8_t * y_col = y + col * by;
+
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t *px = x + i * 1024;
+            const int8_t  *py = y_col + i * 4096;
+            __m512i accu32[PARALLEL_SIZE];
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu32[iy] = _mm512_setzero_si512();
+            }
+
+            int j = 0;
+            for (; j + 1 < 32; j += 2) {
+                __m512i xq8   = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by))))));
+                }
+
+                px += 64;
+                py += 256;
+            }
+            if (j < 32) {
+                __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8   = _mm512_castsi256_si512(xq8_256);
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by)))))));
+                }
+
+                px += 32;
+                py += 128;
+            }
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accu32[iy], one16), accu[iy]);
+            }
+        }
+
+        for (int i = 0; i < groupla_num; i++) {
+            const uint8_t *px = x + group32_num * 1024;
+            const int8_t  *py = y_col + group32_num * 4096;
+            __m512i accula[PARALLEL_SIZE];
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accula[iy] = _mm512_setzero_si512();
+            }
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                __m512i xq8   = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by))))));
+                }
+
+                px += 64;
+                py += 256;
+            }
+            if (j < la_num) {
+                __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8   = _mm512_castsi256_si512(xq8_256);
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by)))))));
+                }
+
+                px += 32;
+                py += 128;
+            }
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accula[iy], one16), accu[iy]);
+            }
+        }
+
+        for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+            int sumi = hsum_i32_16(accu[iy]);
+            s[(col + iy) * bs] = (float)sumi;
+        }
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -808,7 +1269,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size
             accu[iy] = _mm256_setzero_si256();
         }
 
-        int8_t * y_col = y + col * by;
+        const int8_t * y_col = y + col * by;
         
         for (int i = 0; i < group32_num; i++) {
             const uint8_t *px = x + i * 1024;
diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py
new file mode 100644
index 000000000..3a4340734
--- /dev/null
+++ b/utils/convert-bitnet-embedding-to-gguf.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+from hashlib import sha256
+from pathlib import Path
+from typing import Any, Iterator
+
+import numpy as np
+import torch
+
+# Allow using the local gguf-py if present
+if "NO_LOCAL_GGUF" not in os.environ:
+    _local_gguf = Path(__file__).parent / "gguf-py"
+    if _local_gguf.exists():
+        sys.path.insert(1, str(_local_gguf))
+import gguf
+
+logger = logging.getLogger("convert-bitnet-embedding")
+
+# ---------------------------------------------------------------------------
+# Tensor name mapping: HuggingFace -> GGUF
+# ---------------------------------------------------------------------------
+
+def build_tensor_name_map(n_layers: int) -> dict[str, str]:
+    """Build HF tensor name -> GGUF tensor name mapping."""
+    mapping: dict[str, str] = {
+        "embed_tokens.weight": "token_embd.weight",
+        "norm.weight": "output_norm.weight",
+    }
+
+    for i in range(n_layers):
+        pfx = f"layers.{i}"
+        blk = f"blk.{i}"
+
+        mapping.update({
+            # Layer norms
+            f"{pfx}.input_layernorm.weight":           f"{blk}.attn_norm.weight",
+            f"{pfx}.post_attention_layernorm.weight":   f"{blk}.ffn_norm.weight",
+
+            # Self-attention projections
+            f"{pfx}.self_attn.q_proj.weight":           f"{blk}.attn_q.weight",
+            f"{pfx}.self_attn.k_proj.weight":           f"{blk}.attn_k.weight",
+            f"{pfx}.self_attn.v_proj.weight":           f"{blk}.attn_v.weight",
+            f"{pfx}.self_attn.o_proj.weight":           f"{blk}.attn_output.weight",
+
+            # QK head norms (standard Qwen3)
+            f"{pfx}.self_attn.q_norm.weight":           f"{blk}.attn_q_norm.weight",
+            f"{pfx}.self_attn.k_norm.weight":           f"{blk}.attn_k_norm.weight",
+
+            # Per-projection input norms (BitNet-specific)
+            f"{pfx}.self_attn.q_proj.norm.weight":      f"{blk}.attn_q_norm_in.weight",
+            f"{pfx}.self_attn.k_proj.norm.weight":      f"{blk}.attn_k_norm_in.weight",
+            f"{pfx}.self_attn.v_proj.norm.weight":      f"{blk}.attn_v_norm_in.weight",
+            f"{pfx}.self_attn.o_proj.norm.weight":      f"{blk}.attn_output_norm_in.weight",
+
+            # MLP projections
+            f"{pfx}.mlp.gate_proj.weight":              f"{blk}.ffn_gate.weight",
+            f"{pfx}.mlp.up_proj.weight":                f"{blk}.ffn_up.weight",
+            f"{pfx}.mlp.down_proj.weight":              f"{blk}.ffn_down.weight",
+
+            # Per-projection input norms for MLP (BitNet-specific)
+            f"{pfx}.mlp.gate_proj.norm.weight":         f"{blk}.ffn_gate_norm_in.weight",
+            f"{pfx}.mlp.up_proj.norm.weight":           f"{blk}.ffn_up_norm_in.weight",
+            f"{pfx}.mlp.down_proj.norm.weight":         f"{blk}.ffn_down_norm_in.weight",
+        })
+
+    return mapping
+
+
+# ---------------------------------------------------------------------------
+# Tokenizer handling (GPT-2 / BPE for Qwen3)
+# ---------------------------------------------------------------------------
+
+def get_vocab_base_pre(tokenizer) -> str:
+    # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+    # is specific for the BPE pre-tokenizer used by the model
+    # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+    # use in llama.cpp to implement the same pre-tokenizer
+
+    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    logger.debug(f"chktok: {chktok}")
+    logger.debug(f"chkhsh: {chkhsh}")
+
+    res = None
+
+    # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+    #       or pull the latest version of the model from Huggingface
+    #       don't edit the hashes manually!
+    if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+        res = "llama-bpe"
+    if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+        # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+        res = "deepseek-llm"
+    if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+        # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+        res = "deepseek-coder"
+    if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+        # ref: https://huggingface.co/tiiuae/falcon-7b
+        res = "falcon"
+    if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+        # ref: https://huggingface.co/openai-community/gpt2
+        res = "gpt-2"
+    if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
+        # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+        res = "qwen2"
+
+    if res is None:
+        logger.warning("\n")
+        logger.warning("**************************************************************************************")
+        logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+        logger.warning("**          There are 2 possible reasons for this:")
+        logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+        logger.warning("**          - the pre-tokenization config has changed upstream")
+        logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+        logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
+        logger.warning("**")
+        logger.warning(f"** chkhsh:  {chkhsh}")
+        logger.warning("**************************************************************************************")
+        logger.warning("\n")
+        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
+    logger.debug(f"chkhsh: {chkhsh}")
+
+    return res
+
+
+def _does_token_look_special(token: str) -> bool:
+    """Check if a token looks like a special token (e.g., <|...|>, <...>)."""
+    if not token:
+        return False
+    # Matches patterns like <|endoftext|>, <s>, </s>, [CLS], [SEP], etc.
+    if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")):
+        return True
+    return False
+
+
+def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
+    """Set GPT-2 BPE vocab for Qwen3."""
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+
+    tokpre = get_vocab_base_pre(tokenizer)
+
+    tokens: list[str] = []
+    toktypes: list[int] = []
+
+    reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
+    added_vocab = tokenizer.get_added_vocab()
+
+    added_tokens_decoder = tokenizer.added_tokens_decoder
+
+    for i in range(vocab_size):
+        if i not in reverse_vocab:
+            tokens.append(f"[PAD{i}]")
+            toktypes.append(gguf.TokenType.UNUSED)
+        elif reverse_vocab[i] in added_vocab:
+            token = reverse_vocab[i]
+
+            # Only encode-decode non-normalized tokens (matching llama.cpp upstream)
+            if not added_tokens_decoder[i].normalized:
+                token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+
+            if added_tokens_decoder[i].special or _does_token_look_special(token):
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                # Pre-normalize user-defined spaces (for Gemma-style tokenizers)
+                token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+
+            tokens.append(token)
+        else:
+            tokens.append(reverse_vocab[i])
+            toktypes.append(gguf.TokenType.NORMAL)
+
+    gguf_writer.add_tokenizer_model("gpt2")
+    gguf_writer.add_tokenizer_pre(tokpre)
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_types(toktypes)
+
+    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+    # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the
+    # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work
+    # correctly, llama.cpp must append the same token.
+    special_vocab.special_token_ids["eos"] = 151643
+    special_vocab.add_to_gguf(gguf_writer)
+
+    # Embedding models need EOS token appended for last-token pooling
+    gguf_writer.add_add_eos_token(True)
+
+
+# ---------------------------------------------------------------------------
+# GGUF metadata
+# ---------------------------------------------------------------------------
+
+def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int):
+    gguf_writer.add_name(dir_model.name)
+
+    n_layers = hparams["num_hidden_layers"]
+    n_embd = hparams["hidden_size"]
+    n_head = hparams["num_attention_heads"]
+    n_head_kv = hparams.get("num_key_value_heads", n_head)
+    n_ff = hparams["intermediate_size"]
+
+    gguf_writer.add_block_count(n_layers)
+    gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
+    gguf_writer.add_embedding_length(n_embd)
+    gguf_writer.add_feed_forward_length(n_ff)
+    gguf_writer.add_head_count(n_head)
+    gguf_writer.add_head_count_kv(n_head_kv)
+    gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+    head_dim = hparams.get("head_dim", n_embd // n_head)
+    gguf_writer.add_rope_dimension_count(head_dim)
+    gguf_writer.add_key_length(head_dim)
+    gguf_writer.add_value_length(head_dim)
+
+    if hparams.get("rope_theta") is not None:
+        gguf_writer.add_rope_freq_base(hparams["rope_theta"])
+    if hparams.get("rms_norm_eps") is not None:
+        gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+    gguf_writer.add_file_type(ftype)
+
+    # Pooling type for embedding models
+    # Try to read from modules.json / 1_Pooling/config.json (sentence-transformers convention)
+    pooling_type = None
+    module_path = dir_model / "modules.json"
+    if module_path.is_file():
+        with open(module_path, encoding="utf-8") as f:
+            modules = json.load(f)
+        for mod in modules:
+            if mod["type"].endswith("Pooling"):
+                pooling_path = dir_model / mod["path"] / "config.json"
+                if pooling_path.is_file():
+                    with open(pooling_path, encoding="utf-8") as f:
+                        pooling = json.load(f)
+                    if pooling.get("pooling_mode_mean_tokens"):
+                        pooling_type = gguf.PoolingType.MEAN
+                    elif pooling.get("pooling_mode_cls_token"):
+                        pooling_type = gguf.PoolingType.CLS
+                    elif pooling.get("pooling_mode_lasttoken"):
+                        pooling_type = gguf.PoolingType.LAST
+                break
+    if pooling_type is None:
+        # Default to MEAN pooling for embedding models
+        logger.info("  No pooling config found, defaulting to MEAN pooling")
+        pooling_type = gguf.PoolingType.MEAN
+    gguf_writer.add_pooling_type(pooling_type)
+
+    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}")
+
+
+# ---------------------------------------------------------------------------
+# Tensor iteration from safetensors
+# ---------------------------------------------------------------------------
+
+def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
+    """Yield (name, tensor) from safetensors files."""
+    from safetensors import safe_open
+
+    safetensor_files = sorted(dir_model.glob("*.safetensors"))
+    if not safetensor_files:
+        raise FileNotFoundError(f"No .safetensors files in {dir_model}")
+
+    for sf_path in safetensor_files:
+        logger.info(f"Loading {sf_path.name}")
+        with safe_open(str(sf_path), framework="pt", device="cpu") as f:
+            for name in f.keys():
+                yield name, f.get_tensor(name)
+
+
+# ---------------------------------------------------------------------------
+# I2_S ternary packing (platform-independent)
+# ---------------------------------------------------------------------------
+#
+# I2_S format (from dequantize_row_i2_s in ggml-quants.c):
+#   - Every 128 values form a block, packed into 32 bytes
+#   - Each byte stores 4 values at positions [0*32+gp, 1*32+gp, 2*32+gp, 3*32+gp]
+#     where gp is the byte index within the 32-byte group
+#   - Encoding per byte: c0=(b>>6)&3, c1=(b>>4)&3, c2=(b>>2)&3, c3=(b>>0)&3
+#   - Value mapping: 0 -> -1, 1 -> 0, 2 -> +1, 3 -> 0
+#   - Scale is stored as a float32 in the first 4 bytes of the 32-byte tail appended after the packed bytes
+
+def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
+    """Quantize float weights to ternary and pack into I2_S layout.
+
+    Uses the same quantization as BitLinear weight_quant_minmax():
+        scale = 1.0 / mean(|w|)
+        q = round(w * scale).clamp(-1, 1)
+        dequant = q / scale = q * mean(|w|)
+
+    The I2_S format is self-contained: packed ternary bytes followed by a f32 scale
+    appended at the end of the data buffer.
+
+    Args:
+        w: float weight tensor of shape (M, K)
+
+    Returns:
+        packed_data: uint8 array containing I2_S packed bytes + scale (as 4 trailing bytes)
+    """
+    M, K = w.shape
+    n = M * K
+    w_flat = w.flatten().astype(np.float32)
+
+    # BitLinear weight_quant_minmax: scale = 1/mean(|w|), then round & clamp
+    abs_mean = np.mean(np.abs(w_flat))
+    abs_mean = max(abs_mean, 1e-5)
+    inv_scale = 1.0 / abs_mean
+    q_float = np.round(w_flat * inv_scale).clip(-1, 1)  # ternary: {-1, 0, 1}
+
+    # scale for dequantization = abs_mean (i.e., dequant = q * abs_mean)
+    scale = np.float32(abs_mean)
+
+    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
+    #   -1 -> 0,  0 -> 1,  +1 -> 2
+    q = np.ones(n, dtype=np.uint8)  # default to 1 (zero)
+    q[q_float > 0.5] = 2    # +1 -> 2
+    q[q_float < -0.5] = 0   # -1 -> 0
+
+    # Pack into I2_S layout: 128-value blocks, interleaved into 32 bytes
+    # Pad to multiple of 128
+    pad_len = (128 - n % 128) % 128
+    if pad_len:
+        q = np.pad(q, (0, pad_len), constant_values=1)
+
+    n_padded = len(q)
+    n_blocks = n_padded // 128
+
+    q = q.reshape(n_blocks, 4, 32)
+
+    # Pack: byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3
+    packed = (q[:, 0, :].astype(np.uint8) << 6) | \
+             (q[:, 1, :].astype(np.uint8) << 4) | \
+             (q[:, 2, :].astype(np.uint8) << 2) | \
+             (q[:, 3, :].astype(np.uint8))
+
+    packed = packed.reshape(-1).astype(np.uint8)
+
+    # I2_S format: packed_bytes + 32-byte aligned tail (scale in first 4 bytes of tail)
+    # Total size = n_elements / 4 + 32  (as defined in ggml.c)
+    packed_size = n // 4
+    total_size = packed_size + 32
+    result = np.zeros(total_size, dtype=np.uint8)
+    result[:len(packed)] = packed[:packed_size]
+    # Write scale as float32 at offset packed_size
+    result[packed_size:packed_size+4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main conversion
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF")
+    parser.add_argument("model", type=Path, help="Model directory")
+    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
+    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
+                        help="Output type: f32, f16, or i2_s (ternary quantized)")
+    parser.add_argument("--verbose", action="store_true")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    dir_model = args.model
+    if not dir_model.is_dir():
+        logger.error(f"{dir_model} is not a directory")
+        sys.exit(1)
+
+    # Default output filename
+    if args.outfile is None:
+        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
+        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"
+
+    # Load config
+    with open(dir_model / "config.json") as f:
+        hparams = json.load(f)
+
+    arch = hparams.get("model_type", "qwen3")
+    assert arch == "qwen3", f"Expected qwen3 architecture, got {arch}"
+
+    n_layers = hparams["num_hidden_layers"]
+
+    # Determine ftype
+    if args.outtype == "f32":
+        ftype = 0  # GGML F32
+    elif args.outtype == "f16":
+        ftype = 1  # GGML F16
+    else:  # i2_s
+        ftype = 40  # LLAMA_FTYPE_MOSTLY_I2_S
+
+    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")
+
+    # Create GGUF writer
+    gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3")
+
+    # Set parameters
+    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)
+
+    # Set vocab
+    logger.info("Setting tokenizer/vocab...")
+    set_vocab(gguf_writer, dir_model, hparams)
+
+    # Build tensor name map
+    tensor_map = build_tensor_name_map(n_layers)
+
+    # Process tensors
+    logger.info("Processing tensors...")
+    tensor_count = 0
+    for hf_name, data_torch in iter_tensors(dir_model):
+        # Skip tensors we don't need
+        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Strip "model." prefix if present
+        name = hf_name
+        if name.startswith("model."):
+            name = name[len("model."):]
+
+        # Look up GGUF name
+        gguf_name = tensor_map.get(name)
+        if gguf_name is None:
+            logger.warning(f"Skipping unmapped tensor: {hf_name}")
+            continue
+
+        old_dtype = data_torch.dtype
+
+        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
+        if data_torch.dtype == torch.bfloat16:
+            data_torch = data_torch.to(torch.float32)
+
+        data = data_torch.squeeze().numpy()
+        n_dims = len(data.shape)
+        data_shape = data.shape
+
+        # Determine if this is a linear weight suitable for ternary quantization
+        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
+        is_embed = gguf_name == "token_embd.weight"
+        is_linear_weight = n_dims == 2 and not is_norm and not is_embed
+        suit_i2 = is_linear_weight
+
+        if args.outtype == "i2_s" and suit_i2:
+            # --- I2_S ternary packing (scale embedded in data) ---
+            packed = quantize_to_i2_s(data)
+            data_qtype = gguf.GGMLQuantizationType.I2_S
+
+            shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}"
+            logger.info(f"  {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")
+
+            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype)
+            tensor_count += 1
+
+        elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed):
+            # 2D weight tensors (linear + embedding) -> f16
+            data = data.astype(np.float16)
+            logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            gguf_writer.add_tensor(gguf_name, data)
+            tensor_count += 1
+
+        else:
+            # norms, 1D tensors
+            if args.outtype in ("f16", "i2_s"):
+                data = data.astype(np.float16)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            else:
+                if data.dtype != np.float32:
+                    data = data.astype(np.float32)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
+            gguf_writer.add_tensor(gguf_name, data)
+            tensor_count += 1
+
+    logger.info(f"Total tensors written: {tensor_count}")
+
+    # Note: output.weight (lm_head) is skipped for embedding models —
+    # it is not needed (no token generation) and saves ~297MB for this model.
+
+    # Write GGUF
+    logger.info(f"Writing to {args.outfile}...")
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+    gguf_writer.close()
+
+    logger.info("Done!")
+
+
+if __name__ == "__main__":
+    main()

From 9a3f3a22a739f097969f612d25e826747b84143c Mon Sep 17 00:00:00 2001
From: isHuangXin <huangxin.hust@gmail.com>
Date: Thu, 21 May 2026 14:56:53 +0800
Subject: [PATCH 2/3] [feat] Add GGUF conversion and inference support for
 BitNet embedding 270m (Gemma3)

- Add convert-bitnet-embedding-270m-to-gguf.py for Gemma3-based 270m models
- Support f32, f16, and I2_S ternary quantization output types
- Add AVX512BW SIMD paths for I2_S dot product in ggml-bitnet-mad.cpp
- Add immintrin.h include and bitnet-lut-kernels.h guard in ggml-bitnet-lut.cpp
- Add documentation for Gemma3 GGUF conversion implementation
- Update llama.cpp submodule with Gemma3 architecture support
---
 3rdparty/llama.cpp                            |   2 +-
 ...itnet-embeddings-gemma3-gguf-conversion.md | 336 +++++++++++++
 .../convert-bitnet-embedding-270m-to-gguf.py  | 441 ++++++++++++++++++
 3 files changed, 778 insertions(+), 1 deletion(-)
 create mode 100644 docs/bitnet-embeddings-gemma3-gguf-conversion.md
 create mode 100644 utils/convert-bitnet-embedding-270m-to-gguf.py

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 13e129947..a0d4c71d7 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15
+Subproject commit a0d4c71d70f5837451f8faec122c7f0e8aa242aa
diff --git a/docs/bitnet-embeddings-gemma3-gguf-conversion.md b/docs/bitnet-embeddings-gemma3-gguf-conversion.md
new file mode 100644
index 000000000..236b78d1e
--- /dev/null
+++ b/docs/bitnet-embeddings-gemma3-gguf-conversion.md
@@ -0,0 +1,336 @@
+# BitNet Embeddings (Gemma3) GGUF Conversion Implementation
+
+## 1. Background
+
+`bitnet-embeddings-270m` is a Gemma3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul:
+
+```
+x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
+```
+
+This pattern does **not** exist in any standard llama.cpp architecture:
+- Standard Gemma3: no per-projection norms
+- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
+
+### Model Config
+
+- Architecture: `Gemma3TextModel`
+- hidden_size: 640, num_attention_heads: 4, num_key_value_heads: 1
+- head_dim: 256 (note: != hidden_size/num_heads = 160)
+- intermediate_size: 2048, num_hidden_layers: 18
+- hidden_activation: gelu_pytorch_tanh
+- vocab_size: 262144
+- rope_theta: 10000.0, rms_norm_eps: 1e-06
+- query_pre_attn_scalar: 256
+- tie_word_embeddings: true (implied, no separate output.weight)
+
+### Gemma3 vs Gemma2 Key Differences
+
+| Feature | Gemma2 | Gemma3 |
+|---------|--------|--------|
+| QK head norms | No | Yes (`q_norm`, `k_norm`) |
+| Pre-FFW norm | `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` |
+| Post-FFW norm | `post_ffw_norm` | `post_feedforward_layernorm` → `post_ffw_norm` |
+| Post-attn norm | `post_attention_norm` | Same |
+| Activation | GELU | GELU |
+| Embedding scaling | sqrt(n_embd) | sqrt(n_embd) |
+
+### Per-Layer Tensors (7 extra norm tensors per layer)
+
+| Tensor | Shape |
+|--------|-------|
+| `self_attn.q_proj.norm.weight` | [640] |
+| `self_attn.k_proj.norm.weight` | [640] |
+| `self_attn.v_proj.norm.weight` | [640] |
+| `self_attn.o_proj.norm.weight` | [1024] |
+| `mlp.gate_proj.norm.weight` | [640] |
+| `mlp.up_proj.norm.weight` | [640] |
+| `mlp.down_proj.norm.weight` | [2048] |
+
+---
+
+## 2. GGUF Tensor Name Mapping
+
+| HF Name | GGUF Name | Notes |
+|----------|-----------|-------|
+| `embed_tokens.weight` | `token_embd.weight` | |
+| `norm.weight` | `output_norm.weight` | |
+| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
+| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` | |
+| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` | |
+| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` | |
+| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
+| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
+| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
+| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
+| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
+| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
+| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
+| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
+
+---
+
+## 3. Conversion Script
+
+### `utils/convert-bitnet-embedding-270m-to-gguf.py`
+
+Standalone conversion script (safetensors → GGUF). Key features:
+
+- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
+- Supports three output types:
+  - `--outtype f32`: all weights in float32
+  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
+  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
+- Writes `key_length` and `value_length` metadata for head_dim=256
+- Does not write a separate `query_pre_attn_scalar` key: the config value (256) equals `head_dim`, which is written as `key_length`/`value_length`
+- GemmaTokenizerFast (BPE) tokenizer handling with pre-tokenizer hash verification
+- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
+- EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1)
+- Architecture string: `"gemma3"`
+
+### I2_S Ternary Packing
+
+The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
+
+- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
+- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
+- Every 128 values form a block, packed into 32 bytes
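+- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`, where for byte `j` of a block `c0..c3` are the codes at offsets `j`, `32+j`, `64+j`, `96+j`
+- A 32-byte trailer is appended after the packed data; its first 4 bytes hold the float32 value `mean(|w|)` (the reciprocal of `scale` above) and the rest is zero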
+
+### Tensor Type Assignment
+
+| Tensor Type | f16 mode | i2_s mode |
+|-------------|----------|-----------|
+| 2D linear weights | float16 | I2_S ternary packed |
+| Embedding weights | float16 | float16 |
+| Norm weights (1D) | float16 | float16 |
+
+Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
+
+---
+
+## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
+
+### 4.1 New Architecture: `LLM_ARCH_GEMMA3`
+
+Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`.
+
+### 4.2 Tensor Enums (shared with Qwen3)
+
+Reuses the 7 per-projection norm tensor enums added for Qwen3:
+
+```cpp
+LLM_TENSOR_ATTN_Q_NORM_IN,
+LLM_TENSOR_ATTN_K_NORM_IN,
+LLM_TENSOR_ATTN_V_NORM_IN,
+LLM_TENSOR_ATTN_OUT_NORM_IN,
+LLM_TENSOR_FFN_GATE_NORM_IN,
+LLM_TENSOR_FFN_UP_NORM_IN,
+LLM_TENSOR_FFN_DOWN_NORM_IN,
+```
+
+### 4.3 Tensor Name Mappings for `LLM_ARCH_GEMMA3`
+
+```cpp
+{ LLM_TENSOR_TOKEN_EMBD,          "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM,         "output_norm" },
+{ LLM_TENSOR_ATTN_NORM,           "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q,              "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K,              "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V,              "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT,            "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_Q_NORM,         "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM,         "blk.%d.attn_k_norm" },
+{ LLM_TENSOR_ATTN_Q_NORM_IN,      "blk.%d.attn_q_norm_in" },
+{ LLM_TENSOR_ATTN_K_NORM_IN,      "blk.%d.attn_k_norm_in" },
+{ LLM_TENSOR_ATTN_V_NORM_IN,      "blk.%d.attn_v_norm_in" },
+{ LLM_TENSOR_ATTN_OUT_NORM_IN,    "blk.%d.attn_output_norm_in" },
+{ LLM_TENSOR_ATTN_POST_NORM,      "blk.%d.post_attention_norm" },
+{ LLM_TENSOR_FFN_NORM,            "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE,            "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN,            "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP,              "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_GATE_NORM_IN,    "blk.%d.ffn_gate_norm_in" },
+{ LLM_TENSOR_FFN_UP_NORM_IN,      "blk.%d.ffn_up_norm_in" },
+{ LLM_TENSOR_FFN_DOWN_NORM_IN,    "blk.%d.ffn_down_norm_in" },
+{ LLM_TENSOR_FFN_POST_NORM,       "blk.%d.post_ffw_norm" },
+```
+
+### 4.4 load_tensors (LLM_ARCH_GEMMA3)
+
+Based on Gemma2's tensor loading with additions:
+
+- QK head norms: `attn_q_norm`, `attn_k_norm`
+- All 7 BitNet per-projection norm_in tensors (TENSOR_NOT_REQUIRED)
+
+```cpp
+layer.attn_q_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_v_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_out_norm_in = create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED);
+layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_up_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff}, TENSOR_NOT_REQUIRED);
+```
+
+### 4.5 build_gemma3() Graph Function
+
+Combines Gemma2's structure with Qwen3's per-projection norm pattern:
+
+**Key features:**
+- Embedding scaling by `sqrt(n_embd)` (Gemma convention)
+- GELU activation (gelu_pytorch_tanh)
+- QK head norms after Q/K projection
+- Conditional per-projection RMSNorm (backward compatible)
+- Post-attention and post-FFN layer norms
+- `wo=NULL` pattern for `attn_out_norm_in` (same as Qwen3)
+- `query_pre_attn_scalar` for attention scaling
+
+**Attention per-projection norms:**
+```
+// Before Q/K/V matmul:
+if (layer.attn_q_norm_in) {
+    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
+} else {
+    cur_q = cur;
+}
+Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
+// QK head norms applied after projection
+Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps);
+Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm);
+```
+
+**O_proj norm** with `wo=NULL` pattern:
+```
+cur = llm_build_kv(..., wo=NULL, ...);
+if (layer.attn_out_norm_in) {
+    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
+}
+cur = ggml_mul_mat(ctx, layer.wo, cur);
+```
+
+**FFN per-projection norms with GELU:**
+```
+if (layer.ffn_gate_norm_in) {
+    tmp_gate = rms_norm(cur) * gate_norm_in;
+} else {
+    tmp_gate = cur;
+}
+tmp_gate = matmul(gate_proj, tmp_gate);
+tmp_gate = gelu(tmp_gate);  // GELU, not SILU
+// ...
+```
+
+---
+
+## 5. GGUF Conversion Process
+
+There are two GGUF files to produce, from **two different source models**:
+
+| GGUF Output | Source Model | Description |
+|-------------|-------------|-------------|
+| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline, standard float16 weights |
+| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed weights |
+
+### 5.1 F16 GGUF: from multilingual-e5-270m-260311
+
+```bash
+python3 utils/convert-bitnet-embedding-270m-to-gguf.py \
+  /path/to/multilingual-e5-270m-260311 \
+  --outtype f16
+```
+
+**What happens:**
+1. Load `model.safetensors` (standard Gemma3 weights, bfloat16)
+2. Convert all 2D weights (projections, embeddings) to float16
+3. Convert norm weights to float16
+4. Write GGUF with `gemma3` architecture metadata and tokenizer
+
+### 5.2 I2_S GGUF: from bitnet-embeddings-270m
+
+```bash
+python3 utils/convert-bitnet-embedding-270m-to-gguf.py \
+  /path/to/bitnet-embeddings-270m \
+  --outtype i2_s
+```
+
+**What happens:**
+1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
+2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer
+3. For each 2D linear weight: quantize to I2_S ternary packed format
+4. Keep embeddings (`token_embd.weight`) in float16
+5. Keep all norm weights in float16
+6. Skip `output.weight` (lm_head, not needed for embedding models)
+7. Write GGUF with `I2_S` type tag for quantized tensors
+
+### 5.3 Why Two Different Source Models?
+
+- `multilingual-e5-270m-260311` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
+- `bitnet-embeddings-270m` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
+- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
+
+### 5.4 Tensor Type Summary
+
+| Tensor | F16 (from e5-270m) | I2_S (from bitnet-270m) |
+|--------|---------------------|-------------------------|
+| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
+| Embedding (`token_embd.weight`) | float16 | float16 |
+| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
+| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 |
+| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 |
+| `output.weight` (lm_head) | skipped | skipped |
+
+---
+
+## 6. Additional Changes
+
+### 6.1 ggml.c: F16 Norm Weight Support
+
+Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`.
+
+### 6.2 gguf-py: I2_S Type
+
+Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`.
+
+### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard
+
+Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available.
+
+### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD
+
+Added AVX512BW SIMD paths for I2_S dot product functions:
+- `ggml_vec_dot_i2_i8_s_1x1`
+- `ggml_vec_dot_i2_i8_s_1xN`
+- `ggml_vec_dot_i2_i8_s_Nx1`
+
+---
+
+## 7. Build and Run
+
+```bash
+# Build with BitNet repo (includes I2_S support)
+cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build --target llama-embedding llama-bench -j$(nproc)
+
+# Run embedding inference
+build/bin/llama-embedding -m bitnet-embeddings-270m-i2_s.gguf \
+  -p "hello world" --embd-normalize 2 --embd-output-format array
+
+# Benchmark: F16 vs I2_S
+build/bin/llama-bench -m multilingual-e5-270m-f16.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+build/bin/llama-bench -m bitnet-embeddings-270m-i2_s.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+```
diff --git a/utils/convert-bitnet-embedding-270m-to-gguf.py b/utils/convert-bitnet-embedding-270m-to-gguf.py
new file mode 100644
index 000000000..4408452ee
--- /dev/null
+++ b/utils/convert-bitnet-embedding-270m-to-gguf.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+from hashlib import sha256
+from pathlib import Path
+from typing import Any, Iterator
+
+import numpy as np
+import torch
+
+# Allow using the local gguf-py if present
+if "NO_LOCAL_GGUF" not in os.environ:
+    _local_gguf = Path(__file__).parent / "gguf-py"
+    if _local_gguf.exists():
+        sys.path.insert(1, str(_local_gguf))
+import gguf
+
+logger = logging.getLogger("convert-bitnet-embedding-270m")
+
+# ---------------------------------------------------------------------------
+# Tensor name mapping: HuggingFace -> GGUF
+# ---------------------------------------------------------------------------
+
+def build_tensor_name_map(n_layers: int) -> dict[str, str]:
+    """Return the HF -> GGUF tensor-name table for a model with ``n_layers`` blocks."""
+    # (suffix under HF "layers.{i}.", suffix under GGUF "blk.{i}.")
+    per_layer_suffixes = (
+        # Layer norms
+        ("input_layernorm.weight", "attn_norm.weight"),
+        ("post_attention_layernorm.weight", "post_attention_norm.weight"),
+        ("pre_feedforward_layernorm.weight", "ffn_norm.weight"),
+        ("post_feedforward_layernorm.weight", "post_ffw_norm.weight"),
+
+        # Self-attention projections
+        ("self_attn.q_proj.weight", "attn_q.weight"),
+        ("self_attn.k_proj.weight", "attn_k.weight"),
+        ("self_attn.v_proj.weight", "attn_v.weight"),
+        ("self_attn.o_proj.weight", "attn_output.weight"),
+
+        # QK head norms (Gemma3)
+        ("self_attn.q_norm.weight", "attn_q_norm.weight"),
+        ("self_attn.k_norm.weight", "attn_k_norm.weight"),
+
+        # Per-projection input norms (BitNet-specific)
+        ("self_attn.q_proj.norm.weight", "attn_q_norm_in.weight"),
+        ("self_attn.k_proj.norm.weight", "attn_k_norm_in.weight"),
+        ("self_attn.v_proj.norm.weight", "attn_v_norm_in.weight"),
+        ("self_attn.o_proj.norm.weight", "attn_output_norm_in.weight"),
+
+        # MLP projections
+        ("mlp.gate_proj.weight", "ffn_gate.weight"),
+        ("mlp.up_proj.weight", "ffn_up.weight"),
+        ("mlp.down_proj.weight", "ffn_down.weight"),
+
+        # Per-projection input norms for MLP (BitNet-specific)
+        ("mlp.gate_proj.norm.weight", "ffn_gate_norm_in.weight"),
+        ("mlp.up_proj.norm.weight", "ffn_up_norm_in.weight"),
+        ("mlp.down_proj.norm.weight", "ffn_down_norm_in.weight"),
+    )
+
+    name_map: dict[str, str] = {
+        "embed_tokens.weight": "token_embd.weight",
+        "norm.weight": "output_norm.weight",
+    }
+    for layer_idx in range(n_layers):
+        for hf_suffix, gguf_suffix in per_layer_suffixes:
+            name_map[f"layers.{layer_idx}.{hf_suffix}"] = f"blk.{layer_idx}.{gguf_suffix}"
+
+    return name_map
+
+
+# ---------------------------------------------------------------------------
+# Tokenizer handling (BPE for Gemma3)
+# ---------------------------------------------------------------------------
+
+def get_vocab_base_pre(tokenizer) -> str:
+    """Return the ``tokenizer.ggml.pre`` name by hashing how a fixed probe text is encoded."""
+    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    logger.debug(f"chktok: {chktok}")
+    logger.debug(f"chkhsh: {chkhsh}")
+
+    # SHA-256 of the stringified probe token ids -> pre-tokenizer name.
+    known_digests = {
+        # ref: multilingual-e5-270m-260311 (Gemma3 tokenizer)
+        "fcb6bf9f20f6c40fa4aa4f7f99607bd6c106ca2348efdacacdca8152e59dcfe9": "default",
+        # ref: google/gemma-2-9b
+        "a8594e3edff7c29c003940395316294b2c623571571fc8d3d2d6571f5571cbe6": "default",
+    }
+    res = known_digests.get(chkhsh)
+
+    if res is None:
+        banner = "**************************************************************************************"
+        # Emit the warning banner one log record per line, then abort.
+        for text in ("\n", banner, "** WARNING: The BPE pre-tokenizer was not recognized!",
+                     f"** chkhsh:  {chkhsh}", banner, "\n"):
+            logger.warning(text)
+
+        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
+    return res
+
+
+def _does_token_look_special(token: str) -> bool:
+    """Return True for a non-empty token that starts with '<|', '<' or '[' and ends with '|>', '>' or ']'."""
+    # Empty tokens are never special.
+    opening = ("<|", "<", "[")
+    closing = ("|>", ">", "]")
+    return bool(token) and token.startswith(opening) and token.endswith(closing)
+
+
+def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
+    """Write the Gemma3 BPE tokens, token types and special-vocab data (with merges) to the writer."""
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+    tokpre = get_vocab_base_pre(tokenizer)
+
+    id_to_token = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}
+    added_vocab = tokenizer.get_added_vocab()
+    added_decoder = tokenizer.added_tokens_decoder
+    spm_space = b"\xe2\x96\x81".decode("utf-8")
+
+    tokens: list[str] = []
+    toktypes: list[int] = []
+    for tok_id in range(vocab_size):
+        if tok_id not in id_to_token:
+            tokens.append(f"[PAD{tok_id}]")
+            toktypes.append(gguf.TokenType.UNUSED)
+            continue
+
+        token = id_to_token[tok_id]
+        if token not in added_vocab:
+            tokens.append(token)
+            toktypes.append(gguf.TokenType.NORMAL)
+            continue
+
+        # Added token: non-normalized ones are round-tripped through the tokenizer.
+        entry = added_decoder[tok_id]
+        if not entry.normalized:
+            token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+        if entry.special or _does_token_look_special(token):
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            token = token.replace(spm_space, " ")
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+        tokens.append(token)
+
+    gguf_writer.add_tokenizer_model("gpt2")
+    gguf_writer.add_tokenizer_pre(tokpre)
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_types(toktypes)
+
+    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+    special_vocab.add_to_gguf(gguf_writer)
+
+
+# ---------------------------------------------------------------------------
+# GGUF metadata
+# ---------------------------------------------------------------------------
+
+def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int):
+    """Write model hyperparameters, file type and pooling type (MEAN when none is found)."""
+    gguf_writer.add_name(dir_model.name)
+    n_layers, n_embd = hparams["num_hidden_layers"], hparams["hidden_size"]
+    n_head = hparams["num_attention_heads"]
+    n_head_kv = hparams.get("num_key_value_heads", n_head)
+    n_ff = hparams["intermediate_size"]
+    gguf_writer.add_block_count(n_layers)
+    gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
+    gguf_writer.add_embedding_length(n_embd)
+    gguf_writer.add_feed_forward_length(n_ff)
+    gguf_writer.add_head_count(n_head)
+    gguf_writer.add_head_count_kv(n_head_kv)
+    gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+    head_dim = hparams.get("head_dim", n_embd // n_head)
+    gguf_writer.add_rope_dimension_count(head_dim)
+    gguf_writer.add_key_length(head_dim)
+    gguf_writer.add_value_length(head_dim)
+
+    rope_theta = hparams.get("rope_theta")
+    if rope_theta is not None:
+        gguf_writer.add_rope_freq_base(rope_theta)
+    rms_eps = hparams.get("rms_norm_eps")
+    if rms_eps is not None:
+        gguf_writer.add_layer_norm_rms_eps(rms_eps)
+    gguf_writer.add_file_type(ftype)
+
+    # Pooling type for embedding models
+    pooling_type = None
+    module_path = dir_model / "modules.json"
+    if module_path.is_file():
+        with open(module_path, encoding="utf-8") as f:
+            modules = json.load(f)
+        for mod in modules:
+            if not mod["type"].endswith("Pooling"):
+                continue
+            pooling_path = dir_model / mod["path"] / "config.json"
+            if pooling_path.is_file():
+                with open(pooling_path, encoding="utf-8") as f:
+                    pooling = json.load(f)
+                if pooling.get("pooling_mode_mean_tokens"):
+                    pooling_type = gguf.PoolingType.MEAN
+                elif pooling.get("pooling_mode_cls_token"):
+                    pooling_type = gguf.PoolingType.CLS
+                elif pooling.get("pooling_mode_lasttoken"):
+                    pooling_type = gguf.PoolingType.LAST
+            break
+    if pooling_type is None:
+        logger.info("  No pooling config found, defaulting to MEAN pooling")
+        pooling_type = gguf.PoolingType.MEAN
+    gguf_writer.add_pooling_type(pooling_type)
+
+    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}, head_dim={head_dim}")
+
+
+# ---------------------------------------------------------------------------
+# Tensor iteration from safetensors
+# ---------------------------------------------------------------------------
+
+def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
+    """Lazily yield (name, tensor) pairs from each *.safetensors file, in sorted file order."""
+    from safetensors import safe_open
+
+    shard_paths = sorted(dir_model.glob("*.safetensors"))
+    if not shard_paths:
+        raise FileNotFoundError(f"No .safetensors files in {dir_model}")
+
+    # Each file is opened with the PyTorch framework on CPU.
+    for shard in shard_paths:
+        logger.info(f"Loading {shard.name}")
+        with safe_open(str(shard), framework="pt", device="cpu") as reader:
+            yield from ((key, reader.get_tensor(key)) for key in reader.keys())
+
+
+# ---------------------------------------------------------------------------
+# I2_S ternary packing (platform-independent)
+# ---------------------------------------------------------------------------
+
+def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
+    """Quantize float weights to ternary and pack them into the I2_S layout.
+
+    Uses the same quantization as BitLinear weight_quant_minmax():
+        q = round(w / mean(|w|)).clamp(-1, 1);  dequant = q * mean(|w|)
+
+    Layout: codes {-1, 0, +1} -> {0, 1, 2}; each run of 128 codes is viewed as
+    4 groups of 32 and byte j of the block holds codes j, 32+j, 64+j, 96+j in
+    bit pairs 7-6, 5-4, 3-2, 1-0.
+
+    Args:
+        w: float weight matrix of shape (M, K); M * K must be a non-zero
+            multiple of 128 (one I2_S block).
+
+    Returns:
+        uint8 array of M*K/4 packed bytes followed by a 32-byte trailer whose
+        first 4 bytes are the float32 scale mean(|w|); the rest is zero.
+
+    Raises:
+        ValueError: if ``w`` is not 2-D or M * K is not a non-zero multiple of 128.
+    """
+    if w.ndim != 2:
+        raise ValueError(f"expected a 2-D weight matrix, got shape {w.shape}")
+    n = w.size
+    # A partial block cannot be represented: the buffer holds exactly n / 4
+    # packed bytes, so reject it instead of mis-assigning padded bytes below.
+    if n == 0 or n % 128 != 0:
+        raise ValueError(f"I2_S needs a non-zero multiple of 128 weights, got {n}")
+
+    w_flat = w.reshape(-1).astype(np.float32)
+
+    # Clamp the scale away from zero so an all-zero matrix does not divide by 0.
+    abs_mean = max(np.mean(np.abs(w_flat)), 1e-5)
+    inv_scale = 1.0 / abs_mean
+    q_float = np.round(w_flat * inv_scale).clip(-1, 1)
+    scale = np.float32(abs_mean)
+
+    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
+    codes = np.ones(n, dtype=np.uint8)
+    codes[q_float > 0.5] = 2
+    codes[q_float < -0.5] = 0
+
+    # Interleave each 128-code block into 32 bytes (4 codes per byte).
+    groups = codes.reshape(n // 128, 4, 32)
+    packed = ((groups[:, 0, :] << 6) | (groups[:, 1, :] << 4)
+              | (groups[:, 2, :] << 2) | groups[:, 3, :]).reshape(-1)
+
+    packed_size = n // 4
+    # 32-byte trailer: float32 scale in its first 4 bytes, zero padding after.
+    result = np.zeros(packed_size + 32, dtype=np.uint8)
+    result[:packed_size] = packed
+    result[packed_size:packed_size + 4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main conversion
+# ---------------------------------------------------------------------------
+
+def main():
+    """Convert a bitnet-embeddings-270m (Gemma3) safetensors checkpoint to GGUF.
+
+    Writes metadata and vocabulary, then every mapped tensor as float32, float16
+    or packed I2_S depending on --outtype. Exits with status 1 when the model
+    path is not a directory or config.json is not a gemma3_text model.
+    """
+    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings-270m (Gemma3) to GGUF")
+    parser.add_argument("model", type=Path, help="Model directory")
+    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
+    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
+                        help="Output type: f32, f16, or i2_s (ternary quantized)")
+    parser.add_argument("--verbose", action="store_true")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    dir_model = args.model
+    if not dir_model.is_dir():
+        logger.error(f"{dir_model} is not a directory")
+        sys.exit(1)
+
+    # Default output filename
+    if args.outfile is None:
+        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
+        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"
+
+    # Load config (JSON is UTF-8; do not depend on the platform default encoding)
+    with open(dir_model / "config.json", encoding="utf-8") as f:
+        hparams = json.load(f)
+
+    # Validate explicitly: an assert would be stripped when Python runs with -O.
+    arch = hparams.get("model_type", "gemma3_text")
+    if arch != "gemma3_text":
+        logger.error(f"Expected gemma3_text architecture, got {arch}")
+        sys.exit(1)
+
+    n_layers = hparams["num_hidden_layers"]
+
+    # File type: 0 = GGML F32, 1 = GGML F16, 40 = LLAMA_FTYPE_MOSTLY_I2_S
+    ftype = {"f32": 0, "f16": 1, "i2_s": 40}[args.outtype]
+
+    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")
+
+    # Create GGUF writer
+    gguf_writer = gguf.GGUFWriter(str(args.outfile), "gemma3")
+
+    # Set parameters
+    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)
+
+    # Set vocab
+    logger.info("Setting tokenizer/vocab...")
+    set_vocab(gguf_writer, dir_model, hparams)
+
+    # Build tensor name map
+    tensor_map = build_tensor_name_map(n_layers)
+
+    # Process tensors
+    logger.info("Processing tensors...")
+    tensor_count = 0
+    for hf_name, data_torch in iter_tensors(dir_model):
+        # Skip tensors we don't need
+        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Strip "model." prefix if present
+        name = hf_name
+        if name.startswith("model."):
+            name = name[len("model."):]
+
+        # Look up GGUF name
+        gguf_name = tensor_map.get(name)
+        if gguf_name is None:
+            logger.warning(f"Skipping unmapped tensor: {hf_name}")
+            continue
+
+        old_dtype = data_torch.dtype
+
+        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
+        if data_torch.dtype == torch.bfloat16:
+            data_torch = data_torch.to(torch.float32)
+
+        data = data_torch.squeeze().numpy()
+        n_dims = len(data.shape)
+        data_shape = data.shape
+
+        # Determine if this is a linear weight suitable for ternary quantization
+        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
+        is_embed = gguf_name == "token_embd.weight"
+        is_linear_weight = n_dims == 2 and not is_norm and not is_embed
+
+        if args.outtype == "i2_s" and is_linear_weight:
+            # --- I2_S ternary packing (scale embedded in data) ---
+            packed = quantize_to_i2_s(data)
+            data_qtype = gguf.GGMLQuantizationType.I2_S
+            shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}"
+            logger.info(f"  {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")
+            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype)
+
+        elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed):
+            # 2D weight tensors (linear + embedding) -> f16
+            data = data.astype(np.float16)
+            logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            gguf_writer.add_tensor(gguf_name, data)
+
+        else:
+            # norms and other 1-D tensors; in f32 mode every tensor lands here
+            if args.outtype in ("f16", "i2_s"):
+                data = data.astype(np.float16)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            else:
+                if data.dtype != np.float32:
+                    data = data.astype(np.float32)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
+            gguf_writer.add_tensor(gguf_name, data)
+
+        tensor_count += 1
+
+    logger.info(f"Total tensors written: {tensor_count}")
+
+    # Write GGUF
+    logger.info(f"Writing to {args.outfile}...")
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+    gguf_writer.close()
+
+    logger.info("Done!")
+
+
+if __name__ == "__main__":
+    main()  # run the conversion only when executed as a script, not on import

From 5720fc7c030483f8e20f824ea8345c1c2efbe252 Mon Sep 17 00:00:00 2001
From: isHuangXin <huangxin.hust@gmail.com>
Date: Sun, 24 May 2026 18:24:01 +0800
Subject: [PATCH 3/3] [refactor] Unify GGUF conversion script and docs for
 Qwen3/Gemma3 embedding models

- Merge convert-bitnet-embedding-270m-to-gguf.py into convert-bitnet-embedding-to-gguf.py
  with auto-detection of model architecture (qwen3/gemma3_text) from config.json
- Merge separate Qwen3 and Gemma3 conversion docs into a single
  bitnet-embeddings-gguf-conversion.md
- Remove redundant per-architecture scripts and docs
---
 ...itnet-embeddings-gemma3-gguf-conversion.md | 336 -------------
 docs/bitnet-embeddings-gguf-conversion.md     | 410 ++++++++++++++++
 ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 ------------
 .../convert-bitnet-embedding-270m-to-gguf.py  | 441 ------------------
 utils/convert-bitnet-embedding-to-gguf.py     | 117 +++--
 5 files changed, 483 insertions(+), 1123 deletions(-)
 delete mode 100644 docs/bitnet-embeddings-gemma3-gguf-conversion.md
 create mode 100644 docs/bitnet-embeddings-gguf-conversion.md
 delete mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md
 delete mode 100644 utils/convert-bitnet-embedding-270m-to-gguf.py

diff --git a/docs/bitnet-embeddings-gemma3-gguf-conversion.md b/docs/bitnet-embeddings-gemma3-gguf-conversion.md
deleted file mode 100644
index 236b78d1e..000000000
--- a/docs/bitnet-embeddings-gemma3-gguf-conversion.md
+++ /dev/null
@@ -1,336 +0,0 @@
-# BitNet Embeddings (Gemma3) GGUF Conversion Implementation
-
-## 1. Background
-
-`bitnet-embeddings-270m` is a Gemma3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul:
-
-```
-x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
-```
-
-This pattern does **not** exist in any standard llama.cpp architecture:
-- Standard Gemma3: no per-projection norms
-- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
-
-### Model Config
-
-- Architecture: `Gemma3TextModel`
-- hidden_size: 640, num_attention_heads: 4, num_key_value_heads: 1
-- head_dim: 256 (note: != hidden_size/num_heads = 160)
-- intermediate_size: 2048, num_hidden_layers: 18
-- hidden_activation: gelu_pytorch_tanh
-- vocab_size: 262144
-- rope_theta: 10000.0, rms_norm_eps: 1e-06
-- query_pre_attn_scalar: 256
-- tie_word_embeddings: true (implied, no separate output.weight)
-
-### Gemma3 vs Gemma2 Key Differences
-
-| Feature | Gemma2 | Gemma3 |
-|---------|--------|--------|
-| QK head norms | No | Yes (`q_norm`, `k_norm`) |
-| Pre-FFW norm | `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` |
-| Post-FFW norm | `post_ffw_norm` | `post_feedforward_layernorm` → `post_ffw_norm` |
-| Post-attn norm | `post_attention_norm` | Same |
-| Activation | GELU | GELU |
-| Embedding scaling | sqrt(n_embd) | sqrt(n_embd) |
-
-### Per-Layer Tensors (7 extra norm tensors per layer)
-
-| Tensor | Shape |
-|--------|-------|
-| `self_attn.q_proj.norm.weight` | [640] |
-| `self_attn.k_proj.norm.weight` | [640] |
-| `self_attn.v_proj.norm.weight` | [640] |
-| `self_attn.o_proj.norm.weight` | [1024] |
-| `mlp.gate_proj.norm.weight` | [640] |
-| `mlp.up_proj.norm.weight` | [640] |
-| `mlp.down_proj.norm.weight` | [2048] |
-
----
-
-## 2. GGUF Tensor Name Mapping
-
-| HF Name | GGUF Name | Notes |
-|----------|-----------|-------|
-| `embed_tokens.weight` | `token_embd.weight` | |
-| `norm.weight` | `output_norm.weight` | |
-| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
-| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` | |
-| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` | |
-| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` | |
-| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
-| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
-| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
-| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
-| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
-| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
-| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
-| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
-| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
-| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
-
----
-
-## 3. Conversion Script
-
-### `utils/convert-bitnet-embedding-270m-to-gguf.py`
-
-Standalone conversion script (safetensors → GGUF). Key features:
-
-- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
-- Supports three output types:
-  - `--outtype f32`: all weights in float32
-  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
-  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
-- Writes `key_length` and `value_length` metadata for head_dim=256
-- Writes `query_pre_attn_scalar = 256` for correct attention scaling
-- GemmaTokenizerFast (BPE) tokenizer handling with pre-tokenizer hash verification
-- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
-- EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1)
-- Architecture string: `"gemma3"`
-
-### I2_S Ternary Packing
-
-The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
-
-- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
-- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
-- Every 128 values form a block, packed into 32 bytes
-- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`
-- Scale (float32) is appended at the end of the packed data buffer
-
-### Tensor Type Assignment
-
-| Tensor Type | f16 mode | i2_s mode |
-|-------------|----------|-----------|
-| 2D linear weights | float16 | I2_S ternary packed |
-| Embedding weights | float16 | float16 |
-| Norm weights (1D) | float16 | float16 |
-
-Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
-
----
-
-## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
-
-### 4.1 New Architecture: `LLM_ARCH_GEMMA3`
-
-Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`.
-
-### 4.2 Tensor Enums (shared with Qwen3)
-
-Reuses the 7 per-projection norm tensor enums added for Qwen3:
-
-```cpp
-LLM_TENSOR_ATTN_Q_NORM_IN,
-LLM_TENSOR_ATTN_K_NORM_IN,
-LLM_TENSOR_ATTN_V_NORM_IN,
-LLM_TENSOR_ATTN_OUT_NORM_IN,
-LLM_TENSOR_FFN_GATE_NORM_IN,
-LLM_TENSOR_FFN_UP_NORM_IN,
-LLM_TENSOR_FFN_DOWN_NORM_IN,
-```
-
-### 4.3 Tensor Name Mappings for `LLM_ARCH_GEMMA3`
-
-```cpp
-{ LLM_TENSOR_TOKEN_EMBD,          "token_embd" },
-{ LLM_TENSOR_OUTPUT_NORM,         "output_norm" },
-{ LLM_TENSOR_ATTN_NORM,           "blk.%d.attn_norm" },
-{ LLM_TENSOR_ATTN_Q,              "blk.%d.attn_q" },
-{ LLM_TENSOR_ATTN_K,              "blk.%d.attn_k" },
-{ LLM_TENSOR_ATTN_V,              "blk.%d.attn_v" },
-{ LLM_TENSOR_ATTN_OUT,            "blk.%d.attn_output" },
-{ LLM_TENSOR_ATTN_Q_NORM,         "blk.%d.attn_q_norm" },
-{ LLM_TENSOR_ATTN_K_NORM,         "blk.%d.attn_k_norm" },
-{ LLM_TENSOR_ATTN_Q_NORM_IN,      "blk.%d.attn_q_norm_in" },
-{ LLM_TENSOR_ATTN_K_NORM_IN,      "blk.%d.attn_k_norm_in" },
-{ LLM_TENSOR_ATTN_V_NORM_IN,      "blk.%d.attn_v_norm_in" },
-{ LLM_TENSOR_ATTN_OUT_NORM_IN,    "blk.%d.attn_output_norm_in" },
-{ LLM_TENSOR_ATTN_POST_NORM,      "blk.%d.post_attention_norm" },
-{ LLM_TENSOR_FFN_NORM,            "blk.%d.ffn_norm" },
-{ LLM_TENSOR_FFN_GATE,            "blk.%d.ffn_gate" },
-{ LLM_TENSOR_FFN_DOWN,            "blk.%d.ffn_down" },
-{ LLM_TENSOR_FFN_UP,              "blk.%d.ffn_up" },
-{ LLM_TENSOR_FFN_GATE_NORM_IN,    "blk.%d.ffn_gate_norm_in" },
-{ LLM_TENSOR_FFN_UP_NORM_IN,      "blk.%d.ffn_up_norm_in" },
-{ LLM_TENSOR_FFN_DOWN_NORM_IN,    "blk.%d.ffn_down_norm_in" },
-{ LLM_TENSOR_FFN_POST_NORM,       "blk.%d.post_ffw_norm" },
-```
-
-### 4.4 load_tensors (LLM_ARCH_GEMMA3)
-
-Based on Gemma2's tensor loading with additions:
-
-- QK head norms: `attn_q_norm`, `attn_k_norm`
-- All 7 BitNet per-projection norm_in tensors (TENSOR_NOT_REQUIRED)
-
-```cpp
-layer.attn_q_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_k_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_v_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_out_norm_in = create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED);
-layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.ffn_up_norm_in   = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff}, TENSOR_NOT_REQUIRED);
-```
-
-### 4.5 build_gemma3() Graph Function
-
-Combines Gemma2's structure with Qwen3's per-projection norm pattern:
-
-**Key features:**
-- Embedding scaling by `sqrt(n_embd)` (Gemma convention)
-- GELU activation (gelu_pytorch_tanh)
-- QK head norms after Q/K projection
-- Conditional per-projection RMSNorm (backward compatible)
-- Post-attention and post-FFN layer norms
-- `wo=NULL` pattern for `attn_out_norm_in` (same as Qwen3)
-- `query_pre_attn_scalar` for attention scaling
-
-**Attention per-projection norms:**
-```
-// Before Q/K/V matmul:
-if (layer.attn_q_norm_in) {
-    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
-    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
-} else {
-    cur_q = cur;
-}
-Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
-// QK head norms applied after projection
-Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps);
-Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm);
-```
-
-**O_proj norm** with `wo=NULL` pattern:
-```
-cur = llm_build_kv(..., wo=NULL, ...);
-if (layer.attn_out_norm_in) {
-    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
-    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
-}
-cur = ggml_mul_mat(ctx, layer.wo, cur);
-```
-
-**FFN per-projection norms with GELU:**
-```
-if (layer.ffn_gate_norm_in) {
-    tmp_gate = rms_norm(cur) * gate_norm_in;
-} else {
-    tmp_gate = cur;
-}
-tmp_gate = matmul(gate_proj, tmp_gate);
-tmp_gate = gelu(tmp_gate);  // GELU, not SILU
-// ...
-```
-
----
-
-## 5. GGUF Conversion Process
-
-There are two GGUF files to produce, from **two different source models**:
-
-| GGUF Output | Source Model | Description |
-|-------------|-------------|-------------|
-| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline, standard float16 weights |
-| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed weights |
-
-### 5.1 F16 GGUF: from multilingual-e5-270m-260311
-
-```bash
-python3 utils/convert-bitnet-embedding-270m-to-gguf.py \
-  /path/to/multilingual-e5-270m-260311 \
-  --outtype f16
-```
-
-**What happens:**
-1. Load `model.safetensors` (standard Gemma3 weights, bfloat16)
-2. Convert all 2D weights (projections, embeddings) to float16
-3. Convert norm weights to float16
-4. Write GGUF with `gemma3` architecture metadata and tokenizer
-
-### 5.2 I2_S GGUF: from bitnet-embeddings-270m
-
-```bash
-python3 utils/convert-bitnet-embedding-270m-to-gguf.py \
-  /path/to/bitnet-embeddings-270m \
-  --outtype i2_s
-```
-
-**What happens:**
-1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
-2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer
-3. For each 2D linear weight: quantize to I2_S ternary packed format
-4. Keep embeddings (`token_embd.weight`) in float16
-5. Keep all norm weights in float16
-6. Skip `output.weight` (lm_head, not needed for embedding models)
-7. Write GGUF with `I2_S` type tag for quantized tensors
-
-### 5.3 Why Two Different Source Models?
-
-- `multilingual-e5-270m-260311` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
-- `bitnet-embeddings-270m` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
-- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
-
-### 5.4 Tensor Type Summary
-
-| Tensor | F16 (from e5-270m) | I2_S (from bitnet-270m) |
-|--------|---------------------|-------------------------|
-| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
-| Embedding (`token_embd.weight`) | float16 | float16 |
-| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
-| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 |
-| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 |
-| `output.weight` (lm_head) | skipped | skipped |
-
----
-
-## 6. Additional Changes
-
-### 6.1 ggml.c: F16 Norm Weight Support
-
-Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`.
-
-### 6.2 gguf-py: I2_S Type
-
-Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`.
-
-### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard
-
-Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available.
-
-### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD
-
-Added AVX512BW SIMD paths for I2_S dot product functions:
-- `ggml_vec_dot_i2_i8_s_1x1`
-- `ggml_vec_dot_i2_i8_s_1xN`
-- `ggml_vec_dot_i2_i8_s_Nx1`
-
----
-
-## 7. Build and Run
-
-```bash
-# Build with BitNet repo (includes I2_S support)
-cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
-cmake --build build --target llama-embedding llama-bench -j$(nproc)
-
-# Run embedding inference
-build/bin/llama-embedding -m bitnet-embeddings-270m-i2_s.gguf \
-  -p "hello world" --embd-normalize 2 --embd-output-format array
-
-# Benchmark: F16 vs I2_S
-build/bin/llama-bench -m multilingual-e5-270m-f16.gguf \
-  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
-
-build/bin/llama-bench -m bitnet-embeddings-270m-i2_s.gguf \
-  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
-```
diff --git a/docs/bitnet-embeddings-gguf-conversion.md b/docs/bitnet-embeddings-gguf-conversion.md
new file mode 100644
index 000000000..a4ee919d6
--- /dev/null
+++ b/docs/bitnet-embeddings-gguf-conversion.md
@@ -0,0 +1,410 @@
+# BitNet Embeddings GGUF Conversion Implementation
+
+## 1. Background
+
+BitNet embedding models apply a per-projection RMSNorm (`BitLinear`) before each linear projection (q/k/v/o/gate/up/down). Each projection carries a `.norm.weight` tensor — the learned scale of the RMSNorm applied to that projection's input **before** the matmul:
+
+```
+x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
+```
+
+This pattern does **not** exist in any standard llama.cpp architecture:
+- Standard Qwen3/Gemma3: no per-projection norms
+- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
+
+Currently two base architectures are supported:
+
+| | bitnet-embeddings-0.6b (Qwen3) | bitnet-embeddings-270m (Gemma3) |
+|---|---|---|
+| Architecture | `Qwen3Model` | `Gemma3TextModel` |
+| hidden_size | 1024 | 640 |
+| num_attention_heads | 16 | 4 |
+| num_key_value_heads | 8 | 1 |
+| head_dim | 128 (note: != hidden_size/num_heads = 64) | 256 (note: != hidden_size/num_heads = 160) |
+| intermediate_size | 3072 | 2048 |
+| num_hidden_layers | 28 | 18 |
+| hidden_activation | SiLU | gelu_pytorch_tanh |
+| vocab_size | 151936 | 262144 |
+| rope_theta | 1000000 | 10000.0 |
+| rms_norm_eps | 1e-06 | 1e-06 |
+| query_pre_attn_scalar | N/A | 256 |
+| tie_word_embeddings | true | true |
+
+### Gemma3 vs Qwen3 Key Differences
+
+| Feature | Qwen3 | Gemma3 |
+|---------|-------|--------|
+| Post-attn norm | No | Yes (`post_attention_norm`) |
+| Post-FFW norm | No | Yes (`post_ffw_norm`) |
+| Pre-FFW norm naming | `post_attention_layernorm` → `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` |
+| QK head norms | Yes | Yes |
+| Activation | SiLU | GELU |
+| Embedding scaling | No | sqrt(n_embd) |
+| EOS token override | Yes (`<\|endoftext\|>` 151643) | No (auto from tokenizer) |
+
+### Per-Layer Tensors (7 extra norm tensors per layer)
+
+| Tensor | Qwen3 Shape | Gemma3 Shape |
+|--------|-------------|--------------|
+| `self_attn.q_proj.norm.weight` | [1024] | [640] |
+| `self_attn.k_proj.norm.weight` | [1024] | [640] |
+| `self_attn.v_proj.norm.weight` | [1024] | [640] |
+| `self_attn.o_proj.norm.weight` | [2048] | [1024] |
+| `mlp.gate_proj.norm.weight` | [1024] | [640] |
+| `mlp.up_proj.norm.weight` | [1024] | [640] |
+| `mlp.down_proj.norm.weight` | [3072] | [2048] |
+
+---
+
+## 2. GGUF Tensor Name Mapping
+
+### Common Tensors (both architectures)
+
+| HF Name | GGUF Name | Notes |
+|----------|-----------|-------|
+| `embed_tokens.weight` | `token_embd.weight` | |
+| `norm.weight` | `output_norm.weight` | |
+| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
+| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
+| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
+| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
+| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
+| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
+| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
+| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
+| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
+
+### Architecture-Specific Tensors
+
+**Qwen3:**
+
+| HF Name | GGUF Name |
+|----------|-----------|
+| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` |
+
+**Gemma3 (additional):**
+
+| HF Name | GGUF Name |
+|----------|-----------|
+| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` |
+| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` |
+| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` |
+
+---
+
+## 3. Conversion Script
+
+### `utils/convert-bitnet-embedding-to-gguf.py`
+
+Unified standalone conversion script (safetensors → GGUF) that **auto-detects** the model architecture from `config.json`'s `model_type` field (`qwen3` or `gemma3_text`). Key features:
+
+- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
+- Auto-detection of architecture and GGUF arch string (`qwen3` / `gemma3`)
+- Supports three output types:
+  - `--outtype f32`: all weights in float32
+  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
+  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
+- Writes `key_length` and `value_length` metadata to record the correct head_dim (critical: for both models head_dim != hidden_size/num_heads, so the default calculation would give wrong values)
+- BPE tokenizer handling with per-architecture pre-tokenizer hash verification:
+  - Qwen3: GPT-2 BPE tokenizer
+  - Gemma3: GemmaTokenizerFast (BPE)
+- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
+- Architecture-specific tokenizer handling:
+  - Qwen3: EOS token override (`<|endoftext|>` 151643) + `add_eos_token(True)` for last-token pooling
+  - Gemma3: EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1)
+- Gemma3: writes `query_pre_attn_scalar = 256` for correct attention scaling
+
+### I2_S Ternary Packing
+
+The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
+
+- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
+- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
+- Every 128 values form a block, packed into 32 bytes
+- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`
+- Scale (float32) is appended at the end of the packed data buffer
+
+### Tensor Type Assignment
+
+| Tensor Type | f16 mode | i2_s mode |
+|-------------|----------|-----------|
+| 2D linear weights | float16 | I2_S ternary packed |
+| Embedding weights | float16 | float16 |
+| Norm weights (1D) | float16 | float16 |
+
+Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
+
+---
+
+## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
+
+### 4.1 New Architecture: `LLM_ARCH_GEMMA3`
+
+Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`. Qwen3 (`LLM_ARCH_QWEN3`) was added by the 0.6b adaptation.
+
+### 4.2 New Tensor Enums (shared across architectures)
+
+Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`:
+
+```cpp
+LLM_TENSOR_ATTN_Q_NORM_IN,
+LLM_TENSOR_ATTN_K_NORM_IN,
+LLM_TENSOR_ATTN_V_NORM_IN,
+LLM_TENSOR_ATTN_OUT_NORM_IN,
+LLM_TENSOR_FFN_GATE_NORM_IN,
+LLM_TENSOR_FFN_UP_NORM_IN,
+LLM_TENSOR_FFN_DOWN_NORM_IN,
+```
+
+### 4.3 Layer Struct Fields
+
+Added to `struct llama_layer`:
+
+```cpp
+struct ggml_tensor * attn_q_norm_in;
+struct ggml_tensor * attn_k_norm_in;
+struct ggml_tensor * attn_v_norm_in;
+struct ggml_tensor * attn_out_norm_in;
+struct ggml_tensor * ffn_gate_norm_in;
+struct ggml_tensor * ffn_up_norm_in;
+struct ggml_tensor * ffn_down_norm_in;
+```
+
+### 4.4 Tensor Name Mappings
+
+Both `LLM_ARCH_QWEN3` and `LLM_ARCH_GEMMA3` include the 7 per-projection norm tensor mappings plus standard tensors (see Section 2 for full mapping). Key differences:
+
+- Qwen3 includes `LLM_TENSOR_OUTPUT` (`"output"`); Gemma3 does not (uses tied embeddings directly)
+- Gemma3 additionally includes `LLM_TENSOR_ATTN_POST_NORM` (`"blk.%d.post_attention_norm"`) and `LLM_TENSOR_FFN_POST_NORM` (`"blk.%d.post_ffw_norm"`)
+
+### 4.5 load_tensors
+
+Both architectures load the 7 per-projection norm tensors as optional (`TENSOR_NOT_REQUIRED`):
+
+```cpp
+layer.attn_q_norm_in   = create_tensor(tn(...), {n_embd},              TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_in   = create_tensor(tn(...), {n_embd},              TENSOR_NOT_REQUIRED);
+layer.attn_v_norm_in   = create_tensor(tn(...), {n_embd},              TENSOR_NOT_REQUIRED);
+layer.attn_out_norm_in = create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED);
+layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd},              TENSOR_NOT_REQUIRED);
+layer.ffn_up_norm_in   = create_tensor(tn(...), {n_embd},              TENSOR_NOT_REQUIRED);
+layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff},                TENSOR_NOT_REQUIRED);
+```
+
+Note: the `o_proj.norm` weight has `n_embd_head_k * n_head` elements (Qwen3: 2048, Gemma3: 1024), and the `down_proj.norm` weight has `n_ff` elements (Qwen3: 3072, Gemma3: 2048).
+
+Both graph-building functions apply the same per-projection norm pattern. It is fully backward compatible: each norm is applied only if its `*_norm_in` tensor was loaded, so a model without these tensors behaves exactly as before.
+
+**Attention per-projection norms:**
+```
+// Before Q/K/V matmul:
+if (layer.attn_q_norm_in) {
+    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
+} else {
+    cur_q = cur;
+}
+Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
+// QK head norms applied after projection
+Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps);
+Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm);
+```
+
+**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually:
+
+```
+cur = llm_build_kv(..., wo=NULL, ...);  // returns attention output without o_proj
+if (layer.attn_out_norm_in) {
+    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
+}
+cur = ggml_mul_mat(ctx, layer.wo, cur);
+```
+
+**FFN per-projection norms:**
+```
+// Instead of llm_build_ffn(), manually:
+if (layer.ffn_gate_norm_in) {
+    tmp_gate = rms_norm(cur) * gate_norm_in;
+} else {
+    tmp_gate = cur;
+}
+tmp_gate = matmul(gate_proj, tmp_gate);
+tmp_gate = activation(tmp_gate);  // SiLU for Qwen3, GELU for Gemma3
+// Similarly for up_proj
+tmp = tmp_gate * tmp_up;
+
+if (layer.ffn_down_norm_in) {
+    tmp = rms_norm(tmp) * down_norm_in;
+}
+cur = matmul(down_proj, tmp);
+```
+
+**Gemma3-specific differences:**
+- Embedding scaling by `sqrt(n_embd)` (Gemma convention)
+- GELU activation instead of SiLU
+- Post-attention and post-FFN layer norms
+- `query_pre_attn_scalar` for attention scaling
+
+---
+
+## 5. GGUF Conversion Process
+
+For each architecture, two GGUF files are produced, each from a **different source model**:
+
+### 5.1 Qwen3 (0.6b)
+
+| GGUF Output | Source Model | Description |
+|-------------|-------------|-------------|
+| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline |
+| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed |
+
+**F16 (from standard Qwen3 model):**
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/multilingual-e5-0.6b \
+  --outtype f16 \
+  --outfile embeddings-0.6b-f16.gguf
+```
+
+What happens:
+1. Load `model.safetensors` (standard Qwen3 weights, bfloat16)
+2. Convert all 2D weights (projections, embeddings) to float16
+3. Convert norm weights to float16
+4. Write GGUF with `qwen3` architecture metadata and tokenizer
+
+**Output:** ~1.11 GiB (595.78M params)
+
+**I2_S (from BitNet model):**
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/bitnet-embeddings-0.6b \
+  --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s
+```
+
+What happens:
+1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
+2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer
+3. For each 2D linear weight: quantize to I2_S ternary packed format
+4. Keep embeddings (`token_embd.weight`) in float16
+5. Keep all norm weights in float16
+6. Skip `output.weight` (lm_head, not needed for embedding models)
+7. Write GGUF with `I2_S` type tag for quantized tensors
+
+**Output:** ~699 MiB (~61% of the F16 size)
+
+### 5.2 Gemma3 (270m)
+
+| GGUF Output | Source Model | Description |
+|-------------|-------------|-------------|
+| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline |
+| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed |
+
+**F16 (from standard Gemma3 model):**
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/multilingual-e5-270m-260311 \
+  --outtype f16
+```
+
+What happens:
+1. Load `model.safetensors` (standard Gemma3 weights, bfloat16)
+2. Convert all 2D weights (projections, embeddings) to float16
+3. Convert norm weights to float16
+4. Write GGUF with `gemma3` architecture metadata and tokenizer
+
+**I2_S (from BitNet model):**
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/bitnet-embeddings-270m \
+  --outtype i2_s
+```
+
+What happens:
+1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
+2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer
+3. For each 2D linear weight: quantize to I2_S ternary packed format
+4. Keep embeddings (`token_embd.weight`) in float16
+5. Keep all norm weights in float16
+6. Skip `output.weight` (lm_head, not needed for embedding models)
+7. Write GGUF with `I2_S` type tag for quantized tensors
+
+### 5.3 Why Two Different Source Models?
+
+- `multilingual-e5-*` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
+- `bitnet-embeddings-*` is the **BitNet quantized student model** with ternary (≈1.58-bit) weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
+- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
+
+### 5.4 Tensor Type Summary
+
+| Tensor | F16 (baseline) | I2_S (BitNet) |
+|--------|----------------|---------------|
+| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
+| Embedding (`token_embd.weight`) | float16 | float16 |
+| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
+| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 |
+| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 |
+| `output.weight` (lm_head) | skipped | skipped |
+
+---
+
+## 6. Additional Changes
+
+### 6.1 ggml.c: F16 Norm Weight Support
+
+Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`.
+
+### 6.2 gguf-py: I2_S Type
+
+Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`.
+
+### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard
+
+Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available.
+
+### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD
+
+Added AVX512BW SIMD paths for I2_S dot product functions:
+- `ggml_vec_dot_i2_i8_s_1x1`
+- `ggml_vec_dot_i2_i8_s_1xN`
+- `ggml_vec_dot_i2_i8_s_Nx1`
+
+---
+
+## 7. Build and Run
+
+```bash
+# Build with BitNet repo (includes I2_S support)
+cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build --target llama-embedding llama-bench -j$(nproc)
+
+# Run embedding inference (Qwen3 example)
+build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -p "hello world" --embd-normalize 2 --embd-output-format array
+
+# Run embedding inference (Gemma3 example)
+build/bin/llama-embedding -m bitnet-embeddings-270m-i2_s.gguf \
+  -p "hello world" --embd-normalize 2 --embd-output-format array
+
+# Benchmark: F16 vs I2_S (Qwen3)
+build/bin/llama-bench -m embeddings-0.6b-f16.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+# Benchmark: F16 vs I2_S (Gemma3)
+build/bin/llama-bench -m multilingual-e5-270m-f16.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+build/bin/llama-bench -m bitnet-embeddings-270m-i2_s.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+```
diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md
deleted file mode 100644
index 9d63c9300..000000000
--- a/docs/bitnet-embeddings-qwen3-gguf-conversion.md
+++ /dev/null
@@ -1,302 +0,0 @@
-# BitNet Embeddings (Qwen3) GGUF Conversion Implementation
-
-## 1. Background
-
-`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul:
-
-```
-x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
-```
-
-This pattern does **not** exist in any standard llama.cpp architecture:
-- Standard Qwen3: no per-projection norms
-- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
-
-### Model Config
-
-- Architecture: `Qwen3Model`
-- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8
-- head_dim: 128 (note: != hidden_size/num_heads = 64)
-- intermediate_size: 3072, num_hidden_layers: 28
-- tie_word_embeddings: true
-- rope_theta: 1000000, rms_norm_eps: 1e-06
-
-### Per-Layer Tensors (7 extra norm tensors per layer)
-
-| Tensor | Shape |
-|--------|-------|
-| `self_attn.q_proj.norm.weight` | [1024] |
-| `self_attn.k_proj.norm.weight` | [1024] |
-| `self_attn.v_proj.norm.weight` | [1024] |
-| `self_attn.o_proj.norm.weight` | [2048] |
-| `mlp.gate_proj.norm.weight` | [1024] |
-| `mlp.up_proj.norm.weight` | [1024] |
-| `mlp.down_proj.norm.weight` | [3072] |
-
----
-
-## 2. GGUF Tensor Name Mapping
-
-| HF Name | GGUF Name | Notes |
-|----------|-----------|-------|
-| `embed_tokens.weight` | `token_embd.weight` | |
-| `norm.weight` | `output_norm.weight` | |
-| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
-| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | |
-| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
-| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
-| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
-| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
-| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
-| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
-| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
-| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
-| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
-| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
-| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
-
----
-
-## 3. Conversion Script
-
-### `utils/convert-bitnet-embedding-to-gguf.py`
-
-Standalone conversion script (safetensors → GGUF). Key features:
-
-- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
-- Supports three output types:
-  - `--outtype f32`: all weights in float32
-  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
-  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
-- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64)
-- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification
-- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
-- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling
-- Architecture string: `"qwen3"`
-
-### I2_S Ternary Packing
-
-The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
-
-- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
-- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
-- Every 128 values form a block, packed into 32 bytes
-- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`
-- Scale (float32) is appended at the end of the packed data buffer
-
-### Tensor Type Assignment
-
-| Tensor Type | f16 mode | i2_s mode |
-|-------------|----------|-----------|
-| 2D linear weights | float16 | I2_S ternary packed |
-| Embedding weights | float16 | float16 |
-| Norm weights (1D) | float16 | float16 |
-
-Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
-
----
-
-## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
-
-### 4.1 New Tensor Enums
-
-Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`:
-
-```cpp
-LLM_TENSOR_ATTN_Q_NORM_IN,
-LLM_TENSOR_ATTN_K_NORM_IN,
-LLM_TENSOR_ATTN_V_NORM_IN,
-LLM_TENSOR_ATTN_OUT_NORM_IN,
-LLM_TENSOR_FFN_GATE_NORM_IN,
-LLM_TENSOR_FFN_UP_NORM_IN,
-LLM_TENSOR_FFN_DOWN_NORM_IN,
-```
-
-### 4.2 Tensor Name Mappings
-
-Added to `LLM_ARCH_QWEN3` tensor name map:
-
-```cpp
-{ LLM_TENSOR_ATTN_Q_NORM_IN,   "blk.%d.attn_q_norm_in" },
-{ LLM_TENSOR_ATTN_K_NORM_IN,   "blk.%d.attn_k_norm_in" },
-{ LLM_TENSOR_ATTN_V_NORM_IN,   "blk.%d.attn_v_norm_in" },
-{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" },
-{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" },
-{ LLM_TENSOR_FFN_UP_NORM_IN,   "blk.%d.ffn_up_norm_in" },
-{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" },
-```
-
-### 4.3 Layer Struct Fields
-
-Added to `struct llama_layer`:
-
-```cpp
-struct ggml_tensor * attn_q_norm_in;
-struct ggml_tensor * attn_k_norm_in;
-struct ggml_tensor * attn_v_norm_in;
-struct ggml_tensor * attn_out_norm_in;
-struct ggml_tensor * ffn_gate_norm_in;
-struct ggml_tensor * ffn_up_norm_in;
-struct ggml_tensor * ffn_down_norm_in;
-```
-
-### 4.4 load_tensors (LLM_ARCH_QWEN3)
-
-Added optional loading with `TENSOR_NOT_REQUIRED`:
-
-```cpp
-layer.attn_q_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_k_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_v_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head},    TENSOR_NOT_REQUIRED);
-layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.ffn_up_norm_in   = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff},   TENSOR_NOT_REQUIRED);
-```
-
-Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072).
-
-### 4.5 build_qwen3() Graph Modifications
-
-The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to original.
-
-**Attention per-projection norms:**
-```
-// Before Q/K/V matmul:
-if (layer.attn_q_norm_in) {
-    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
-    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
-} else {
-    cur_q = cur;
-}
-Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
-// Similarly for K, V
-```
-
-**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually:
-
-```
-cur = llm_build_kv(..., wo=NULL, ...);  // returns attention output without o_proj
-if (layer.attn_out_norm_in) {
-    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
-    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
-}
-cur = ggml_mul_mat(ctx, layer.wo, cur);
-```
-
-**FFN per-projection norms:**
-```
-// Instead of llm_build_ffn(), manually:
-if (layer.ffn_gate_norm_in) {
-    tmp_gate = rms_norm(cur) * gate_norm_in;
-} else {
-    tmp_gate = cur;
-}
-tmp_gate = matmul(gate_proj, tmp_gate);
-// Similarly for up_proj
-tmp = silu(tmp_gate) * tmp_up;
-
-if (layer.ffn_down_norm_in) {
-    tmp = rms_norm(tmp) * down_norm_in;
-}
-cur = matmul(down_proj, tmp);
-```
-
----
-
-## 5. GGUF Conversion Process
-
-There are two GGUF files to produce, from **two different source models**:
-
-| GGUF Output | Source Model | Description |
-|-------------|-------------|-------------|
-| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights |
-| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights |
-
-### 5.1 F16 GGUF: from multilingual-e5-0.6b
-
-The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model:
-
-```bash
-python3 /path/to/llama.cpp/convert_hf_to_gguf.py \
-  /path/to/multilingual-e5-0.6b \
-  --outtype f16 \
-  --outfile embeddings-0.6b-f16.gguf
-```
-
-**What happens:**
-1. Load `model.safetensors` (standard Qwen3 weights, bfloat16)
-2. Convert all 2D weights (projections, embeddings) to float16
-3. Convert norm weights to float32
-4. Write GGUF with `qwen3` architecture metadata and tokenizer
-
-**Output:** ~1.11 GiB (595.78M params)
-
-### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b
-
-The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization:
-
-```bash
-python3 utils/convert-bitnet-embedding-to-gguf.py \
-  /path/to/bitnet-embeddings-0.6b \
-  --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s
-```
-
-**What happens:**
-1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
-2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2)
-3. For each 2D linear weight (q/k/v/o/gate/up/down projections):
-   - Compute scale: `scale = 1 / mean(|w|)`
-   - Quantize: `q = round(w * scale).clamp(-1, 1)`
-   - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2`
-   - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each)
-   - Append per-row float32 scale
-4. Keep embeddings (`token_embd.weight`) in float16 (not ternary)
-5. Keep all norm weights in float16
-6. Skip `output.weight` (lm_head, not needed for embedding models)
-7. Write GGUF with `I2_S` type tag for quantized tensors
-
-**Output:** ~699 MiB (~50% of F16 size)
-
-### 5.3 Why Two Different Source Models?
-
-- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
-- `bitnet-embeddings-0.6b` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
-- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
-
-### 5.4 Tensor Type Summary
-
-| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) |
-|--------|---------------------|-------------------------|
-| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
-| Embedding (`token_embd.weight`) | float16 | float16 |
-| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
-| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 |
-| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 |
-| `output.weight` (lm_head) | present | skipped |
-
----
-
-## 6. Build and Run
-
-```bash
-# Build with BitNet repo (includes I2_S support)
-cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
-cmake --build build --target llama-embedding llama-bench -j$(nproc)
-
-# Run embedding inference
-build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
-  -p "hello world" --embd-normalize 2 --embd-output-format array
-
-# Benchmark: F16 vs I2_S
-build/bin/llama-bench -m embeddings-0.6b-f16.gguf \
-  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
-
-build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
-  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
-```
diff --git a/utils/convert-bitnet-embedding-270m-to-gguf.py b/utils/convert-bitnet-embedding-270m-to-gguf.py
deleted file mode 100644
index 4408452ee..000000000
--- a/utils/convert-bitnet-embedding-270m-to-gguf.py
+++ /dev/null
@@ -1,441 +0,0 @@
-#!/usr/bin/env python3
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-import os
-import sys
-from hashlib import sha256
-from pathlib import Path
-from typing import Any, Iterator
-
-import numpy as np
-import torch
-
-# Allow using the local gguf-py if present
-if "NO_LOCAL_GGUF" not in os.environ:
-    _local_gguf = Path(__file__).parent / "gguf-py"
-    if _local_gguf.exists():
-        sys.path.insert(1, str(_local_gguf))
-import gguf
-
-logger = logging.getLogger("convert-bitnet-embedding-270m")
-
-# ---------------------------------------------------------------------------
-# Tensor name mapping: HuggingFace -> GGUF
-# ---------------------------------------------------------------------------
-
-def build_tensor_name_map(n_layers: int) -> dict[str, str]:
-    """Build HF tensor name -> GGUF tensor name mapping."""
-    mapping: dict[str, str] = {
-        "embed_tokens.weight": "token_embd.weight",
-        "norm.weight": "output_norm.weight",
-    }
-
-    for i in range(n_layers):
-        pfx = f"layers.{i}"
-        blk = f"blk.{i}"
-
-        mapping.update({
-            # Layer norms
-            f"{pfx}.input_layernorm.weight":           f"{blk}.attn_norm.weight",
-            f"{pfx}.post_attention_layernorm.weight":   f"{blk}.post_attention_norm.weight",
-            f"{pfx}.pre_feedforward_layernorm.weight":  f"{blk}.ffn_norm.weight",
-            f"{pfx}.post_feedforward_layernorm.weight": f"{blk}.post_ffw_norm.weight",
-
-            # Self-attention projections
-            f"{pfx}.self_attn.q_proj.weight":           f"{blk}.attn_q.weight",
-            f"{pfx}.self_attn.k_proj.weight":           f"{blk}.attn_k.weight",
-            f"{pfx}.self_attn.v_proj.weight":           f"{blk}.attn_v.weight",
-            f"{pfx}.self_attn.o_proj.weight":           f"{blk}.attn_output.weight",
-
-            # QK head norms (Gemma3)
-            f"{pfx}.self_attn.q_norm.weight":           f"{blk}.attn_q_norm.weight",
-            f"{pfx}.self_attn.k_norm.weight":           f"{blk}.attn_k_norm.weight",
-
-            # Per-projection input norms (BitNet-specific)
-            f"{pfx}.self_attn.q_proj.norm.weight":      f"{blk}.attn_q_norm_in.weight",
-            f"{pfx}.self_attn.k_proj.norm.weight":      f"{blk}.attn_k_norm_in.weight",
-            f"{pfx}.self_attn.v_proj.norm.weight":      f"{blk}.attn_v_norm_in.weight",
-            f"{pfx}.self_attn.o_proj.norm.weight":      f"{blk}.attn_output_norm_in.weight",
-
-            # MLP projections
-            f"{pfx}.mlp.gate_proj.weight":              f"{blk}.ffn_gate.weight",
-            f"{pfx}.mlp.up_proj.weight":                f"{blk}.ffn_up.weight",
-            f"{pfx}.mlp.down_proj.weight":              f"{blk}.ffn_down.weight",
-
-            # Per-projection input norms for MLP (BitNet-specific)
-            f"{pfx}.mlp.gate_proj.norm.weight":         f"{blk}.ffn_gate_norm_in.weight",
-            f"{pfx}.mlp.up_proj.norm.weight":           f"{blk}.ffn_up_norm_in.weight",
-            f"{pfx}.mlp.down_proj.norm.weight":         f"{blk}.ffn_down_norm_in.weight",
-        })
-
-    return mapping
-
-
-# ---------------------------------------------------------------------------
-# Tokenizer handling (BPE for Gemma3)
-# ---------------------------------------------------------------------------
-
-def get_vocab_base_pre(tokenizer) -> str:
-    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-    chktok = tokenizer.encode(chktxt)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.debug(f"chktok: {chktok}")
-    logger.debug(f"chkhsh: {chkhsh}")
-
-    res = None
-
-    if chkhsh == "fcb6bf9f20f6c40fa4aa4f7f99607bd6c106ca2348efdacacdca8152e59dcfe9":
-        # ref: multilingual-e5-270m-260311 (Gemma3 tokenizer)
-        res = "default"
-    if chkhsh == "a8594e3edff7c29c003940395316294b2c623571571fc8d3d2d6571f5571cbe6":
-        # ref: google/gemma-2-9b
-        res = "default"
-
-    if res is None:
-        logger.warning("\n")
-        logger.warning("**************************************************************************************")
-        logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
-        logger.warning(f"** chkhsh:  {chkhsh}")
-        logger.warning("**************************************************************************************")
-        logger.warning("\n")
-        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
-    return res
-
-
-def _does_token_look_special(token: str) -> bool:
-    if not token:
-        return False
-    if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")):
-        return True
-    return False
-
-
-def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
-    """Set BPE vocab for Gemma3."""
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
-    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-
-    tokpre = get_vocab_base_pre(tokenizer)
-
-    tokens: list[str] = []
-    toktypes: list[int] = []
-
-    reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
-    added_vocab = tokenizer.get_added_vocab()
-
-    added_tokens_decoder = tokenizer.added_tokens_decoder
-
-    for i in range(vocab_size):
-        if i not in reverse_vocab:
-            tokens.append(f"[PAD{i}]")
-            toktypes.append(gguf.TokenType.UNUSED)
-        elif reverse_vocab[i] in added_vocab:
-            token = reverse_vocab[i]
-
-            if not added_tokens_decoder[i].normalized:
-                token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-
-            if added_tokens_decoder[i].special or _does_token_look_special(token):
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-
-            tokens.append(token)
-        else:
-            tokens.append(reverse_vocab[i])
-            toktypes.append(gguf.TokenType.NORMAL)
-
-    gguf_writer.add_tokenizer_model("gpt2")
-    gguf_writer.add_tokenizer_pre(tokpre)
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_types(toktypes)
-
-    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
-    special_vocab.add_to_gguf(gguf_writer)
-
-
-# ---------------------------------------------------------------------------
-# GGUF metadata
-# ---------------------------------------------------------------------------
-
-def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int):
-    gguf_writer.add_name(dir_model.name)
-
-    n_layers = hparams["num_hidden_layers"]
-    n_embd = hparams["hidden_size"]
-    n_head = hparams["num_attention_heads"]
-    n_head_kv = hparams.get("num_key_value_heads", n_head)
-    n_ff = hparams["intermediate_size"]
-
-    gguf_writer.add_block_count(n_layers)
-    gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
-    gguf_writer.add_embedding_length(n_embd)
-    gguf_writer.add_feed_forward_length(n_ff)
-    gguf_writer.add_head_count(n_head)
-    gguf_writer.add_head_count_kv(n_head_kv)
-    gguf_writer.add_vocab_size(hparams["vocab_size"])
-
-    head_dim = hparams.get("head_dim", n_embd // n_head)
-    gguf_writer.add_rope_dimension_count(head_dim)
-    gguf_writer.add_key_length(head_dim)
-    gguf_writer.add_value_length(head_dim)
-
-    if hparams.get("rope_theta") is not None:
-        gguf_writer.add_rope_freq_base(hparams["rope_theta"])
-    if hparams.get("rms_norm_eps") is not None:
-        gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-
-    gguf_writer.add_file_type(ftype)
-
-    # Pooling type for embedding models
-    pooling_type = None
-    module_path = dir_model / "modules.json"
-    if module_path.is_file():
-        with open(module_path, encoding="utf-8") as f:
-            modules = json.load(f)
-        for mod in modules:
-            if mod["type"].endswith("Pooling"):
-                pooling_path = dir_model / mod["path"] / "config.json"
-                if pooling_path.is_file():
-                    with open(pooling_path, encoding="utf-8") as f:
-                        pooling = json.load(f)
-                    if pooling.get("pooling_mode_mean_tokens"):
-                        pooling_type = gguf.PoolingType.MEAN
-                    elif pooling.get("pooling_mode_cls_token"):
-                        pooling_type = gguf.PoolingType.CLS
-                    elif pooling.get("pooling_mode_lasttoken"):
-                        pooling_type = gguf.PoolingType.LAST
-                break
-    if pooling_type is None:
-        logger.info("  No pooling config found, defaulting to MEAN pooling")
-        pooling_type = gguf.PoolingType.MEAN
-    gguf_writer.add_pooling_type(pooling_type)
-
-    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}, head_dim={head_dim}")
-
-
-# ---------------------------------------------------------------------------
-# Tensor iteration from safetensors
-# ---------------------------------------------------------------------------
-
-def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
-    """Yield (name, tensor) from safetensors files."""
-    from safetensors import safe_open
-
-    safetensor_files = sorted(dir_model.glob("*.safetensors"))
-    if not safetensor_files:
-        raise FileNotFoundError(f"No .safetensors files in {dir_model}")
-
-    for sf_path in safetensor_files:
-        logger.info(f"Loading {sf_path.name}")
-        with safe_open(str(sf_path), framework="pt", device="cpu") as f:
-            for name in f.keys():
-                yield name, f.get_tensor(name)
-
-
-# ---------------------------------------------------------------------------
-# I2_S ternary packing (platform-independent)
-# ---------------------------------------------------------------------------
-
-def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
-    """Quantize float weights to ternary and pack into I2_S layout.
-
-    Uses the same quantization as BitLinear weight_quant_minmax():
-        scale = 1.0 / mean(|w|)
-        q = round(w * scale).clamp(-1, 1)
-        dequant = q / scale = q * mean(|w|)
-
-    Args:
-        w: float weight tensor of shape (M, K)
-
-    Returns:
-        packed_data: uint8 array containing I2_S packed bytes + scale (as 4 trailing bytes)
-    """
-    M, K = w.shape
-    n = M * K
-    w_flat = w.flatten().astype(np.float32)
-
-    abs_mean = np.mean(np.abs(w_flat))
-    abs_mean = max(abs_mean, 1e-5)
-    inv_scale = 1.0 / abs_mean
-    q_float = np.round(w_flat * inv_scale).clip(-1, 1)
-
-    scale = np.float32(abs_mean)
-
-    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
-    q = np.ones(n, dtype=np.uint8)
-    q[q_float > 0.5] = 2
-    q[q_float < -0.5] = 0
-
-    # Pack into I2_S layout: 128-value blocks, interleaved into 32 bytes
-    pad_len = (128 - n % 128) % 128
-    if pad_len:
-        q = np.pad(q, (0, pad_len), constant_values=1)
-
-    n_padded = len(q)
-    n_blocks = n_padded // 128
-
-    q = q.reshape(n_blocks, 4, 32)
-
-    packed = (q[:, 0, :].astype(np.uint8) << 6) | \
-             (q[:, 1, :].astype(np.uint8) << 4) | \
-             (q[:, 2, :].astype(np.uint8) << 2) | \
-             (q[:, 3, :].astype(np.uint8))
-
-    packed = packed.reshape(-1).astype(np.uint8)
-
-    packed_size = n // 4
-    total_size = packed_size + 32
-    result = np.zeros(total_size, dtype=np.uint8)
-    result[:len(packed)] = packed[:packed_size]
-    result[packed_size:packed_size+4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)
-
-    return result
-
-
-# ---------------------------------------------------------------------------
-# Main conversion
-# ---------------------------------------------------------------------------
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings-270m (Gemma3) to GGUF")
-    parser.add_argument("model", type=Path, help="Model directory")
-    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
-    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
-                        help="Output type: f32, f16, or i2_s (ternary quantized)")
-    parser.add_argument("--verbose", action="store_true")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-
-    dir_model = args.model
-    if not dir_model.is_dir():
-        logger.error(f"{dir_model} is not a directory")
-        sys.exit(1)
-
-    # Default output filename
-    if args.outfile is None:
-        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
-        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"
-
-    # Load config
-    with open(dir_model / "config.json") as f:
-        hparams = json.load(f)
-
-    arch = hparams.get("model_type", "gemma3_text")
-    assert arch == "gemma3_text", f"Expected gemma3_text architecture, got {arch}"
-
-    n_layers = hparams["num_hidden_layers"]
-
-    # Determine ftype
-    if args.outtype == "f32":
-        ftype = 0  # GGML F32
-    elif args.outtype == "f16":
-        ftype = 1  # GGML F16
-    else:  # i2_s
-        ftype = 40  # LLAMA_FTYPE_MOSTLY_I2_S
-
-    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")
-
-    # Create GGUF writer
-    gguf_writer = gguf.GGUFWriter(str(args.outfile), "gemma3")
-
-    # Set parameters
-    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)
-
-    # Set vocab
-    logger.info("Setting tokenizer/vocab...")
-    set_vocab(gguf_writer, dir_model, hparams)
-
-    # Build tensor name map
-    tensor_map = build_tensor_name_map(n_layers)
-
-    # Process tensors
-    logger.info("Processing tensors...")
-    tensor_count = 0
-    for hf_name, data_torch in iter_tensors(dir_model):
-        # Skip tensors we don't need
-        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
-            continue
-
-        # Strip "model." prefix if present
-        name = hf_name
-        if name.startswith("model."):
-            name = name[len("model."):]
-
-        # Look up GGUF name
-        gguf_name = tensor_map.get(name)
-        if gguf_name is None:
-            logger.warning(f"Skipping unmapped tensor: {hf_name}")
-            continue
-
-        old_dtype = data_torch.dtype
-
-        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
-        if data_torch.dtype == torch.bfloat16:
-            data_torch = data_torch.to(torch.float32)
-
-        data = data_torch.squeeze().numpy()
-        n_dims = len(data.shape)
-        data_shape = data.shape
-
-        # Determine if this is a linear weight suitable for ternary quantization
-        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
-        is_embed = gguf_name == "token_embd.weight"
-        is_linear_weight = n_dims == 2 and not is_norm and not is_embed
-        suit_i2 = is_linear_weight
-
-        if args.outtype == "i2_s" and suit_i2:
-            # --- I2_S ternary packing (scale embedded in data) ---
-            packed = quantize_to_i2_s(data)
-            data_qtype = gguf.GGMLQuantizationType.I2_S
-
-            shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}"
-            logger.info(f"  {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")
-
-            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype)
-            tensor_count += 1
-
-        elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed):
-            # 2D weight tensors (linear + embedding) -> f16
-            data = data.astype(np.float16)
-            logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
-            gguf_writer.add_tensor(gguf_name, data)
-            tensor_count += 1
-
-        else:
-            # norms, 1D tensors
-            if args.outtype in ("f16", "i2_s"):
-                data = data.astype(np.float16)
-                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
-            else:
-                if data.dtype != np.float32:
-                    data = data.astype(np.float32)
-                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
-            gguf_writer.add_tensor(gguf_name, data)
-            tensor_count += 1
-
-    logger.info(f"Total tensors written: {tensor_count}")
-
-    # Write GGUF
-    logger.info(f"Writing to {args.outfile}...")
-    gguf_writer.write_header_to_file()
-    gguf_writer.write_kv_data_to_file()
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-    logger.info("Done!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py
index 3a4340734..9c62ac542 100644
--- a/utils/convert-bitnet-embedding-to-gguf.py
+++ b/utils/convert-bitnet-embedding-to-gguf.py
@@ -23,11 +23,17 @@
 
 logger = logging.getLogger("convert-bitnet-embedding")
 
+# Supported architectures: model_type -> gguf arch name
+SUPPORTED_ARCHS = {
+    "qwen3":       "qwen3",
+    "gemma3_text": "gemma3",
+}
+
 # ---------------------------------------------------------------------------
 # Tensor name mapping: HuggingFace -> GGUF
 # ---------------------------------------------------------------------------
 
-def build_tensor_name_map(n_layers: int) -> dict[str, str]:
+def build_tensor_name_map(n_layers: int, arch: str) -> dict[str, str]:
     """Build HF tensor name -> GGUF tensor name mapping."""
     mapping: dict[str, str] = {
         "embed_tokens.weight": "token_embd.weight",
@@ -41,7 +47,6 @@ def build_tensor_name_map(n_layers: int) -> dict[str, str]:
         mapping.update({
             # Layer norms
             f"{pfx}.input_layernorm.weight":           f"{blk}.attn_norm.weight",
-            f"{pfx}.post_attention_layernorm.weight":   f"{blk}.ffn_norm.weight",
 
             # Self-attention projections
             f"{pfx}.self_attn.q_proj.weight":           f"{blk}.attn_q.weight",
@@ -49,7 +54,7 @@ def build_tensor_name_map(n_layers: int) -> dict[str, str]:
             f"{pfx}.self_attn.v_proj.weight":           f"{blk}.attn_v.weight",
             f"{pfx}.self_attn.o_proj.weight":           f"{blk}.attn_output.weight",
 
-            # QK head norms (standard Qwen3)
+            # QK head norms
             f"{pfx}.self_attn.q_norm.weight":           f"{blk}.attn_q_norm.weight",
             f"{pfx}.self_attn.k_norm.weight":           f"{blk}.attn_k_norm.weight",
 
@@ -70,20 +75,29 @@ def build_tensor_name_map(n_layers: int) -> dict[str, str]:
             f"{pfx}.mlp.down_proj.norm.weight":         f"{blk}.ffn_down_norm_in.weight",
         })
 
+        if arch == "qwen3":
+            mapping[f"{pfx}.post_attention_layernorm.weight"] = f"{blk}.ffn_norm.weight"
+        elif arch == "gemma3_text":
+            mapping.update({
+                f"{pfx}.post_attention_layernorm.weight":   f"{blk}.post_attention_norm.weight",
+                f"{pfx}.pre_feedforward_layernorm.weight":  f"{blk}.ffn_norm.weight",
+                f"{pfx}.post_feedforward_layernorm.weight": f"{blk}.post_ffw_norm.weight",
+            })
+
     return mapping
 
 
 # ---------------------------------------------------------------------------
-# Tokenizer handling (GPT-2 / BPE for Qwen3)
+# Tokenizer handling
 # ---------------------------------------------------------------------------
 
-def get_vocab_base_pre(tokenizer) -> str:
+def get_vocab_base_pre(tokenizer, arch: str) -> str:
     # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
     # is specific for the BPE pre-tokenizer used by the model
     # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
     # use in llama.cpp to implement the same pre-tokenizer
 
-    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -93,27 +107,35 @@ def get_vocab_base_pre(tokenizer) -> str:
 
     res = None
 
-    # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
-    #       or pull the latest version of the model from Huggingface
-    #       don't edit the hashes manually!
-    if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-        res = "llama-bpe"
-    if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
-        # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
-        res = "deepseek-llm"
-    if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
-        # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
-        res = "deepseek-coder"
-    if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
-        # ref: https://huggingface.co/tiiuae/falcon-7b
-        res = "falcon"
-    if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
-        # ref: https://huggingface.co/openai-community/gpt2
-        res = "gpt-2"
-    if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
-        # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
-        res = "qwen2"
+    if arch == "qwen3":
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-bpe"
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+            res = "deepseek-llm"
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
+        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+            # ref: https://huggingface.co/openai-community/gpt2
+            res = "gpt-2"
+        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
+            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+            res = "qwen2"
+    elif arch == "gemma3_text":
+        if chkhsh == "fcb6bf9f20f6c40fa4aa4f7f99607bd6c106ca2348efdacacdca8152e59dcfe9":
+            # ref: multilingual-e5-270m-260311 (Gemma3 tokenizer)
+            res = "default"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623571571fc8d3d2d6571f5571cbe6":
+            # ref: google/gemma-2-9b
+            res = "default"
 
     if res is None:
         logger.warning("\n")
@@ -146,13 +168,13 @@ def _does_token_look_special(token: str) -> bool:
     return False
 
 
-def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
-    """Set GPT-2 BPE vocab for Qwen3."""
+def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict, arch: str):
+    """Set BPE vocab."""
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(dir_model)
     vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 
-    tokpre = get_vocab_base_pre(tokenizer)
+    tokpre = get_vocab_base_pre(tokenizer, arch)
 
     tokens: list[str] = []
     toktypes: list[int] = []
@@ -191,14 +213,18 @@ def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
     gguf_writer.add_token_types(toktypes)
 
     special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
-    # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the
-    # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work
-    # correctly, llama.cpp must append the same token.
-    special_vocab.special_token_ids["eos"] = 151643
+
+    if arch == "qwen3":
+        # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the
+        # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work
+        # correctly, llama.cpp must append the same token.
+        special_vocab.special_token_ids["eos"] = 151643
+
     special_vocab.add_to_gguf(gguf_writer)
 
-    # Embedding models need EOS token appended for last-token pooling
-    gguf_writer.add_add_eos_token(True)
+    if arch == "qwen3":
+        # Qwen3 embedding models need the EOS token appended for last-token pooling
+        gguf_writer.add_add_eos_token(True)
 
 
 # ---------------------------------------------------------------------------
@@ -260,7 +286,7 @@ def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model:
         pooling_type = gguf.PoolingType.MEAN
     gguf_writer.add_pooling_type(pooling_type)
 
-    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}")
+    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}, head_dim={head_dim}")
 
 
 # ---------------------------------------------------------------------------
@@ -366,7 +392,7 @@ def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
 # ---------------------------------------------------------------------------
 
 def main():
-    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF")
+    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings (Qwen3/Gemma3) to GGUF")
     parser.add_argument("model", type=Path, help="Model directory")
     parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
     parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
@@ -390,9 +416,12 @@ def main():
     with open(dir_model / "config.json") as f:
         hparams = json.load(f)
 
-    arch = hparams.get("model_type", "qwen3")
-    assert arch == "qwen3", f"Expected qwen3 architecture, got {arch}"
+    arch = hparams.get("model_type", "")
+    if arch not in SUPPORTED_ARCHS:
+        logger.error(f"Unsupported model_type '{arch}'. Supported: {list(SUPPORTED_ARCHS.keys())}")
+        sys.exit(1)
 
+    gguf_arch = SUPPORTED_ARCHS[arch]
     n_layers = hparams["num_hidden_layers"]
 
     # Determine ftype
@@ -403,20 +432,20 @@ def main():
     else:  # i2_s
         ftype = 40  # LLAMA_FTYPE_MOSTLY_I2_S
 
-    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")
+    logger.info(f"Converting {dir_model.name} (arch={arch}) to GGUF ({args.outtype})")
 
     # Create GGUF writer
-    gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3")
+    gguf_writer = gguf.GGUFWriter(str(args.outfile), gguf_arch)
 
     # Set parameters
     set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)
 
     # Set vocab
     logger.info("Setting tokenizer/vocab...")
-    set_vocab(gguf_writer, dir_model, hparams)
+    set_vocab(gguf_writer, dir_model, hparams, arch)
 
     # Build tensor name map
-    tensor_map = build_tensor_name_map(n_layers)
+    tensor_map = build_tensor_name_map(n_layers, arch)
 
     # Process tensors
     logger.info("Processing tensors...")