From 6d186cb9f91a4ff8a69427675875852e1254a1c1 Mon Sep 17 00:00:00 2001 From: isHuangXin Date: Thu, 21 May 2026 12:16:23 +0800 Subject: [PATCH 1/3] Add bitnet-embeddings-0.6b model adaptation with F16 and I2_S GGUF conversion - Add GGUF conversion tool for bitnet-embeddings-0.6b (safetensors -> F16/I2_S GGUF) - Add Qwen3 architecture support in llama.cpp submodule with per-projection RMSNorm - Add I2_S ternary quantization (2-bit packed -1/0/+1) for lossless precision - Add f16 norm weight support for correct embedding inference - Add AVX512BW SIMD paths for I2_S kernel (~2x throughput on AVX512-capable CPUs) - Guard bitnet-lut-kernels.h include with TL1/TL2 preprocessor checks - Update llama.cpp submodule to dev-bitnet-embedding-0.6b branch - Document F16 (from multilingual-e5-0.6b) and I2_S (from bitnet-embeddings-0.6b) conversion process --- 3rdparty/llama.cpp | 2 +- ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 +++++++++++ src/ggml-bitnet-lut.cpp | 7 + src/ggml-bitnet-mad.cpp | 469 +++++++++++++++- utils/convert-bitnet-embedding-to-gguf.py | 502 ++++++++++++++++++ 5 files changed, 1277 insertions(+), 5 deletions(-) create mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md create mode 100644 utils/convert-bitnet-embedding-to-gguf.py diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp index 1f86f058d..13e129947 160000 --- a/3rdparty/llama.cpp +++ b/3rdparty/llama.cpp @@ -1 +1 @@ -Subproject commit 1f86f058de0c3f4098dedae2ae8653c335c868a1 +Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15 diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md new file mode 100644 index 000000000..9d63c9300 --- /dev/null +++ b/docs/bitnet-embeddings-qwen3-gguf-conversion.md @@ -0,0 +1,302 @@ +# BitNet Embeddings (Qwen3) GGUF Conversion Implementation + +## 1. Background + +`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). 
Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: + +``` +x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) +``` + +This pattern does **not** exist in any standard llama.cpp architecture: +- Standard Qwen3: no per-projection norms +- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) + +### Model Config + +- Architecture: `Qwen3Model` +- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8 +- head_dim: 128 (note: != hidden_size/num_heads = 64) +- intermediate_size: 3072, num_hidden_layers: 28 +- tie_word_embeddings: true +- rope_theta: 1000000, rms_norm_eps: 1e-06 + +### Per-Layer Tensors (7 extra norm tensors per layer) + +| Tensor | Shape | +|--------|-------| +| `self_attn.q_proj.norm.weight` | [1024] | +| `self_attn.k_proj.norm.weight` | [1024] | +| `self_attn.v_proj.norm.weight` | [1024] | +| `self_attn.o_proj.norm.weight` | [2048] | +| `mlp.gate_proj.norm.weight` | [1024] | +| `mlp.up_proj.norm.weight` | [1024] | +| `mlp.down_proj.norm.weight` | [3072] | + +--- + +## 2. 
GGUF Tensor Name Mapping + +| HF Name | GGUF Name | Notes | +|----------|-----------|-------| +| `embed_tokens.weight` | `token_embd.weight` | | +| `norm.weight` | `output_norm.weight` | | +| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | +| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | +| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | +| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | +| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | +| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | +| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | +| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | +| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | +| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | +| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | +| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | + +--- + +## 3. Conversion Script + +### `utils/convert-bitnet-embedding-to-gguf.py` + +Standalone conversion script (safetensors → GGUF). 
Key features: + +- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) +- Supports three output types: + - `--outtype f32`: all weights in float32 + - `--outtype f16`: 2D weights and embeddings as float16, norms as float16 + - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 +- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64) +- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification +- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) +- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling +- Architecture string: `"qwen3"` + +### I2_S Ternary Packing + +The I2_S format packs ternary weights {-1, 0, +1} into a 2-bit representation: + +- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)` +- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` +- Every 128 values form a block, packed into 32 bytes +- Each byte stores 4 values, one from each 32-element group of the block: `byte[k] = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`, where `c_g` is the 2-bit code of block element `32*g + k` (k = 0..31) +- Scale (float32) is appended at the end of the packed data buffer + +### Tensor Type Assignment + +| Tensor Type | f16 mode | i2_s mode | +|-------------|----------|-----------| +| 2D linear weights | float16 | I2_S ternary packed | +| Embedding weights | float16 | float16 | +| Norm weights (1D) | float16 | float16 | + +Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). + +--- + +## 4. 
C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) + +### 4.1 New Tensor Enums + +Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`: + +```cpp +LLM_TENSOR_ATTN_Q_NORM_IN, +LLM_TENSOR_ATTN_K_NORM_IN, +LLM_TENSOR_ATTN_V_NORM_IN, +LLM_TENSOR_ATTN_OUT_NORM_IN, +LLM_TENSOR_FFN_GATE_NORM_IN, +LLM_TENSOR_FFN_UP_NORM_IN, +LLM_TENSOR_FFN_DOWN_NORM_IN, +``` + +### 4.2 Tensor Name Mappings + +Added to `LLM_ARCH_QWEN3` tensor name map: + +```cpp +{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, +{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, +{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, +{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, +{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, +{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, +{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, +``` + +### 4.3 Layer Struct Fields + +Added to `struct llama_layer`: + +```cpp +struct ggml_tensor * attn_q_norm_in; +struct ggml_tensor * attn_k_norm_in; +struct ggml_tensor * attn_v_norm_in; +struct ggml_tensor * attn_out_norm_in; +struct ggml_tensor * ffn_gate_norm_in; +struct ggml_tensor * ffn_up_norm_in; +struct ggml_tensor * ffn_down_norm_in; +``` + +### 4.4 load_tensors (LLM_ARCH_QWEN3) + +Added optional loading with `TENSOR_NOT_REQUIRED`: + +```cpp +layer.attn_q_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_k_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_v_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); +layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_up_norm_in = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN, "weight", i), 
{n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff}, TENSOR_NOT_REQUIRED); +``` + +Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072). + +### 4.5 build_qwen3() Graph Modifications + +The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to original. + +**Attention per-projection norms:** +``` +// Before Q/K/V matmul: +if (layer.attn_q_norm_in) { + cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); +} else { + cur_q = cur; +} +Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); +// Similarly for K, V +``` + +**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually: + +``` +cur = llm_build_kv(..., wo=NULL, ...); // returns attention output without o_proj +if (layer.attn_out_norm_in) { + cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); +} +cur = ggml_mul_mat(ctx, layer.wo, cur); +``` + +**FFN per-projection norms:** +``` +// Instead of llm_build_ffn(), manually: +if (layer.ffn_gate_norm_in) { + tmp_gate = rms_norm(cur) * gate_norm_in; +} else { + tmp_gate = cur; +} +tmp_gate = matmul(gate_proj, tmp_gate); +// Similarly for up_proj +tmp = silu(tmp_gate) * tmp_up; + +if (layer.ffn_down_norm_in) { + tmp = rms_norm(tmp) * down_norm_in; +} +cur = matmul(down_proj, tmp); +``` + +--- + +## 5. 
GGUF Conversion Process + +There are two GGUF files to produce, from **two different source models**: + +| GGUF Output | Source Model | Description | +|-------------|-------------|-------------| +| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights | +| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights | + +### 5.1 F16 GGUF: from multilingual-e5-0.6b + +The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model: + +```bash +python3 /path/to/llama.cpp/convert_hf_to_gguf.py \ + /path/to/multilingual-e5-0.6b \ + --outtype f16 \ + --outfile embeddings-0.6b-f16.gguf +``` + +**What happens:** +1. Load `model.safetensors` (standard Qwen3 weights, bfloat16) +2. Convert all 2D weights (projections, embeddings) to float16 +3. Convert norm weights to float32 +4. Write GGUF with `qwen3` architecture metadata and tokenizer + +**Output:** ~1.11 GiB (595.78M params) + +### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b + +The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization: + +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-0.6b \ + --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s +``` + +**What happens:** +1. Load `model.safetensors` (BitNet ternary weights, bfloat16) +2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2) +3. 
For each 2D linear weight (q/k/v/o/gate/up/down projections): + - Compute scale: `scale = 1 / mean(|w|)` + - Quantize: `q = round(w * scale).clamp(-1, 1)` + - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2` + - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each) + - Append the float32 scale at the end of the packed data buffer +4. Keep embeddings (`token_embd.weight`) in float16 (not ternary) +5. Keep all norm weights in float16 +6. Skip `output.weight` (lm_head, not needed for embedding models) +7. Write GGUF with `I2_S` type tag for quantized tensors + +**Output:** ~699 MiB (~61% of F16 size) + +### 5.3 Why Two Different Source Models? + +- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference +- `bitnet-embeddings-0.6b` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference +- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization + +### 5.4 Tensor Type Summary + +| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) | +|--------|---------------------|-------------------------| +| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | +| Embedding (`token_embd.weight`) | float16 | float16 | +| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | +| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 | +| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 | +| `output.weight` (lm_head) | present | skipped | + +--- + +## 6. 
Build and Run + +```bash +# Build with BitNet repo (includes I2_S support) +cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build --target llama-embedding llama-bench -j$(nproc) + +# Run embedding inference +build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -p "hello world" --embd-normalize 2 --embd-output-format array + +# Benchmark: F16 vs I2_S +build/bin/llama-bench -m embeddings-0.6b-f16.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 + +build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 +``` diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp index 59422d548..beef726f7 100644 --- a/src/ggml-bitnet-lut.cpp +++ b/src/ggml-bitnet-lut.cpp @@ -5,9 +5,16 @@ #include #include +#ifdef __x86_64__ +#include +#endif + #include "ggml-bitnet.h" #include "ggml-quants.h" + +#if defined(GGML_BITNET_ARM_TL1) || defined(GGML_BITNET_X86_TL2) #include "bitnet-lut-kernels.h" +#endif #if defined(GGML_BITNET_ARM_TL1) diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp index 4ba9d6509..f99368bbd 100644 --- a/src/ggml-bitnet-mad.cpp +++ b/src/ggml-bitnet-mad.cpp @@ -24,6 +24,12 @@ static inline int hsum_i32_8(const __m256i a) { const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } +#if defined(__AVX512F__) && defined(__AVX512BW__) +// horizontally add 16 int32_t +static inline int hsum_i32_16(const __m512i a) { + return _mm512_reduce_add_epi32(a); +} +#endif #elif defined(__loongarch_asx) // horizontally add 8 int32_t static inline int hsum_i32_8(const __m256i a) { @@ -196,7 +202,153 @@ size_t quantize_i2_s(const float * src, void * dst, int64_t nrow, int64_t n_per_ } void ggml_vec_dot_i2_i8_s_1x1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512BW__) + 
const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int row = 0; row < nrc; row++) { + __m512i accu = _mm512_setzero_si512(); + + const uint8_t * x_row = x + row * bx / 4; + + for (int i = 0; i < group32_num; i++) { + const uint8_t *px = x_row + i * 1024; + const int8_t *py = y + i * 4096; + __m512i accu32 = _mm512_setzero_si512(); + + // Process 2 blocks per iteration (j+=2), 16 iterations instead of 32 + int j = 0; + for (; j + 1 < 32; j += 2) { + // Load 2 consecutive 32-byte weight blocks into one 512-bit register + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + // Load 2 consecutive 128-byte activation blocks (256 bytes total = 4 x 64) + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1)); + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3)); + + px += 64; + py += 256; + } + // Handle odd remaining block + if (j < 32) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8_3 = 
_mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(yq8_0_256)); + xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(yq8_1_256)); + xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(yq8_2_256)); + xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(yq8_3_256)); + + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1)); + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3)); + } + accu = _mm512_add_epi32(_mm512_madd_epi16(accu32, one16), accu); + } + + for (int i = 0; i < groupla_num; i++) { + __m512i accula = _mm512_setzero_si512(); + const uint8_t *px = x_row + group32_num * 1024; + const int8_t *py = y + group32_num * 4096; + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + xq8_0 = 
_mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1)); + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3)); + + px += 64; + py += 256; + } + if (j < la_num) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(yq8_0_256)); + xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(yq8_1_256)); + xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(yq8_2_256)); + xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(yq8_3_256)); + + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1)); + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3)); + } + accu = _mm512_add_epi32(accu, _mm512_madd_epi16(accula, one16)); + } + + int sumi = hsum_i32_16(accu); + s[row] = (float)sumi; + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -510,7 +662,184 @@ void ggml_vec_dot_i2_i8_s_1x4_32W(int n, float * s, size_t bs, const void * vx, } void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if 
defined(__AVX512F__) && defined(__AVX512BW__) + const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int row = 0; row < nrc; row += PARALLEL_SIZE) { + __m512i accu[PARALLEL_SIZE]; + const uint8_t * x_row[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_setzero_si512(); + x_row[rb] = x + (row + rb) * bx / 4; + } + + for (int i = 0; i < group32_num; i++) { + const uint8_t * px[PARALLEL_SIZE]; + __m512i accu32[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + px[rb] = x_row[rb] + i * 1024; + accu32[rb] = _mm512_setzero_si512(); + } + const int8_t *py = y + i * 4096; + + int j = 0; + for (; j + 1 < 32; j += 2) { + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb])); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 64; + } + py += 256; + } + if (j < 
32) { + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + __m512i yq8_0 = _mm512_castsi256_si512(yq8_0_256); + __m512i yq8_1 = _mm512_castsi256_si512(yq8_1_256); + __m512i yq8_2 = _mm512_castsi256_si512(yq8_2_256); + __m512i yq8_3 = _mm512_castsi256_si512(yq8_3_256); + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb])); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 32; + } + } + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_add_epi32(_mm512_madd_epi16(accu32[rb], one16), accu[rb]); + } + } + + for (int i = 0; i < groupla_num; i++) { + const int8_t *py = y + group32_num * 4096; + const uint8_t * px[PARALLEL_SIZE]; + __m512i accula[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + px[rb] = x_row[rb] + group32_num * 1024; + accula[rb] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); 
+ __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb])); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 64; + } + py += 256; + } + if (j < la_num) { + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + __m512i yq8_0 = _mm512_castsi256_si512(yq8_0_256); + __m512i yq8_1 = _mm512_castsi256_si512(yq8_1_256); + __m512i yq8_2 = _mm512_castsi256_si512(yq8_2_256); + __m512i yq8_3 = _mm512_castsi256_si512(yq8_3_256); + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb])); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = 
_mm512_maddubs_epi16(xq8_3, yq8_3); + + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 32; + } + } + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_add_epi32(accu[rb], _mm512_madd_epi16(accula[rb], one16)); + } + } + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + int sumi = hsum_i32_16(accu[rb]); + s[row + rb] = (float)sumi; + } + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -789,7 +1118,139 @@ void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size } void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512BW__) + const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 
1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int col = 0; col < nrc; col += PARALLEL_SIZE) { + __m512i accu[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_setzero_si512(); + } + + const int8_t * y_col = y + col * by; + + for (int i = 0; i < group32_num; i++) { + const uint8_t *px = x + i * 1024; + const int8_t *py = y_col + i * 4096; + __m512i accu32[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < 32; j += 2) { + __m512i xq8 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by)))))); + } + + px += 64; + py += 256; + } + if (j < 32) { + __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8 = _mm512_castsi256_si512(xq8_256); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16( + 
_mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by))))))); + } + + px += 32; + py += 128; + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accu32[iy], one16), accu[iy]); + } + } + + for (int i = 0; i < groupla_num; i++) { + const uint8_t *px = x + group32_num * 1024; + const int8_t *py = y_col + group32_num * 4096; + __m512i accula[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i xq8 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by)))))); + } + + px += 64; + py += 256; + } + if (j < la_num) { + __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8 = _mm512_castsi256_si512(xq8_256); + __m512i xq8_3 = 
_mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by))))))); + } + + px += 32; + py += 128; + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accula[iy], one16), accu[iy]); + } + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + int sumi = hsum_i32_16(accu[iy]); + s[(col + iy) * bs] = (float)sumi; + } + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -808,7 +1269,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size accu[iy] = _mm256_setzero_si256(); } - int8_t * y_col = y + col * by; + const int8_t * y_col = y + col * by; for (int i = 0; i < group32_num; i++) { const uint8_t *px = x + i * 1024; diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py new file mode 100644 index 000000000..3a4340734 --- /dev/null +++ b/utils/convert-bitnet-embedding-to-gguf.py @@ -0,0 +1,502 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from hashlib import sha256 +from pathlib import Path +from typing import Any, Iterator + +import numpy as np 
import torch

# Allow using the local gguf-py if present.
# Setting NO_LOCAL_GGUF in the environment skips this and uses the installed package.
if "NO_LOCAL_GGUF" not in os.environ:
    _local_gguf = Path(__file__).parent / "gguf-py"
    if _local_gguf.exists():
        sys.path.insert(1, str(_local_gguf))
import gguf

logger = logging.getLogger("convert-bitnet-embedding")

# ---------------------------------------------------------------------------
# Tensor name mapping: HuggingFace -> GGUF
# ---------------------------------------------------------------------------

def build_tensor_name_map(n_layers: int) -> dict[str, str]:
    """Build HF tensor name -> GGUF tensor name mapping.

    Keys are HuggingFace names without the leading "model." prefix; values are
    the GGUF names. Two global entries are always present, followed by one
    group of entries per layer index in ``range(n_layers)``.

    Args:
        n_layers: Number of transformer layers to generate entries for.

    Returns:
        A dict mapping each HF tensor name to its GGUF tensor name.
    """
    mapping: dict[str, str] = {
        "embed_tokens.weight": "token_embd.weight",
        "norm.weight": "output_norm.weight",
    }

    for i in range(n_layers):
        pfx = f"layers.{i}"
        blk = f"blk.{i}"

        mapping.update({
            # Layer norms
            f"{pfx}.input_layernorm.weight": f"{blk}.attn_norm.weight",
            f"{pfx}.post_attention_layernorm.weight": f"{blk}.ffn_norm.weight",

            # Self-attention projections
            f"{pfx}.self_attn.q_proj.weight": f"{blk}.attn_q.weight",
            f"{pfx}.self_attn.k_proj.weight": f"{blk}.attn_k.weight",
            f"{pfx}.self_attn.v_proj.weight": f"{blk}.attn_v.weight",
            f"{pfx}.self_attn.o_proj.weight": f"{blk}.attn_output.weight",

            # QK head norms (standard Qwen3)
            f"{pfx}.self_attn.q_norm.weight": f"{blk}.attn_q_norm.weight",
            f"{pfx}.self_attn.k_norm.weight": f"{blk}.attn_k_norm.weight",

            # Per-projection input norms (BitNet-specific)
            f"{pfx}.self_attn.q_proj.norm.weight": f"{blk}.attn_q_norm_in.weight",
            f"{pfx}.self_attn.k_proj.norm.weight": f"{blk}.attn_k_norm_in.weight",
            f"{pfx}.self_attn.v_proj.norm.weight": f"{blk}.attn_v_norm_in.weight",
            f"{pfx}.self_attn.o_proj.norm.weight": f"{blk}.attn_output_norm_in.weight",

            # MLP projections
            f"{pfx}.mlp.gate_proj.weight": f"{blk}.ffn_gate.weight",
            f"{pfx}.mlp.up_proj.weight": f"{blk}.ffn_up.weight",
            f"{pfx}.mlp.down_proj.weight": f"{blk}.ffn_down.weight",

            # Per-projection input norms for MLP (BitNet-specific)
            f"{pfx}.mlp.gate_proj.norm.weight": f"{blk}.ffn_gate_norm_in.weight",
            f"{pfx}.mlp.up_proj.norm.weight": f"{blk}.ffn_up_norm_in.weight",
            f"{pfx}.mlp.down_proj.norm.weight": f"{blk}.ffn_down_norm_in.weight",
        })

    return mapping


# ---------------------------------------------------------------------------
# Tokenizer handling (GPT-2 / BPE for Qwen3)
# ---------------------------------------------------------------------------

def get_vocab_base_pre(tokenizer) -> str:
    """Identify the BPE pre-tokenizer by hashing the encoding of a fixed probe string.

    The probe text is encoded with ``tokenizer.encode``, the resulting token
    list is stringified and SHA-256 hashed, and the digest is compared against
    a fixed set of known digests.

    Args:
        tokenizer: Object exposing ``encode(text)``.

    Returns:
        The pre-tokenizer identifier associated with the matching digest.

    Raises:
        NotImplementedError: If the digest matches none of the known values.
    """
    # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
    # is specific for the BPE pre-tokenizer used by the model
    # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
    # use in llama.cpp to implement the same pre-tokenizer

    # NOTE(review): the digest depends on the exact bytes of this literal, including its
    # whitespace runs -- confirm it against the upstream converter before relying on the hashes.
    chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.debug(f"chktok: {chktok}")
    logger.debug(f"chkhsh: {chkhsh}")

    res = None

    # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
    # or pull the latest version of the model from Huggingface
    # don't edit the hashes manually!
    if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        res = "llama-bpe"
    if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
        # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
        res = "deepseek-llm"
    if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
        # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
        res = "deepseek-coder"
    if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
        # ref: https://huggingface.co/tiiuae/falcon-7b
        res = "falcon"
    if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
        # ref: https://huggingface.co/openai-community/gpt2
        res = "gpt-2"
    if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
        # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
        res = "qwen2"

    if res is None:
        # Unknown digest: print a prominent banner (including the digest) and abort.
        logger.warning("\n")
        logger.warning("**************************************************************************************")
        logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
        logger.warning("**          There are 2 possible reasons for this:")
        logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
        logger.warning("**          - the pre-tokenization config has changed upstream")
        logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
        logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
        logger.warning("**")
        logger.warning(f"** chkhsh:  {chkhsh}")
        logger.warning("**************************************************************************************")
        logger.warning("\n")
        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
    logger.debug(f"chkhsh: {chkhsh}")

    return res


def _does_token_look_special(token: str) -> bool:
    """Check if a token looks like a special token (e.g., <|...|>, <...>).

    Args:
        token: Token text to inspect.

    Returns:
        True when the token is non-empty, starts with one of "<|", "<", "["
        and ends with one of "|>", ">", "]"; False otherwise.
    """
    if not token:
        return False
    # Matches patterns like <|endoftext|>, <s>, </s>, [CLS], [SEP], etc.
    if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")):
        return True
    return False


def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
    """Set GPT-2 BPE vocab for Qwen3.

    Loads the tokenizer from ``dir_model``, classifies every token id below
    the vocabulary size, and writes the token list, token types, merges and
    special-token ids to ``gguf_writer``.

    Args:
        gguf_writer: Writer that receives the tokenizer metadata.
        dir_model: Model directory containing the tokenizer files.
        hparams: Parsed ``config.json``; ``vocab_size`` is used when present,
            otherwise the tokenizer's own vocabulary length.
    """
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(dir_model)
    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))

    tokpre = get_vocab_base_pre(tokenizer)

    tokens: list[str] = []
    toktypes: list[int] = []

    reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
    added_vocab = tokenizer.get_added_vocab()

    added_tokens_decoder = tokenizer.added_tokens_decoder

    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Ids without a token get a placeholder so the list stays dense.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.UNUSED)
        elif reverse_vocab[i] in added_vocab:
            token = reverse_vocab[i]

            # Only encode-decode non-normalized tokens (matching llama.cpp upstream)
            if not added_tokens_decoder[i].normalized:
                token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))

            if added_tokens_decoder[i].special or _does_token_look_special(token):
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                # Pre-normalize user-defined spaces (for Gemma-style tokenizers)
                token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")
                toktypes.append(gguf.TokenType.USER_DEFINED)

            tokens.append(token)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)

    gguf_writer.add_tokenizer_model("gpt2")
    gguf_writer.add_tokenizer_pre(tokpre)
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
    # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the
    # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work
    # correctly, llama.cpp must append the same token.
    # NOTE(review): the id is hard-coded; a different vocabulary would need a lookup.
    special_vocab.special_token_ids["eos"] = 151643
    special_vocab.add_to_gguf(gguf_writer)

    # Embedding models need EOS token appended for last-token pooling
    gguf_writer.add_add_eos_token(True)


# ---------------------------------------------------------------------------
# GGUF metadata
# ---------------------------------------------------------------------------

def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int):
    """Write model hyperparameters and the pooling type to the GGUF header.

    Args:
        gguf_writer: Writer that receives the key/value metadata.
        hparams: Parsed ``config.json``.
        dir_model: Model directory; its name becomes the model name and it is
            searched for ``modules.json`` to detect the pooling mode.
        ftype: File-type code stored via ``add_file_type``.
    """
    gguf_writer.add_name(dir_model.name)

    n_layers = hparams["num_hidden_layers"]
    n_embd = hparams["hidden_size"]
    n_head = hparams["num_attention_heads"]
    n_head_kv = hparams.get("num_key_value_heads", n_head)
    n_ff = hparams["intermediate_size"]

    gguf_writer.add_block_count(n_layers)
    gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
    gguf_writer.add_embedding_length(n_embd)
    gguf_writer.add_feed_forward_length(n_ff)
    gguf_writer.add_head_count(n_head)
    gguf_writer.add_head_count_kv(n_head_kv)
    gguf_writer.add_vocab_size(hparams["vocab_size"])

    # head_dim may differ from hidden_size / num_heads, so prefer the explicit value.
    head_dim = hparams.get("head_dim", n_embd // n_head)
    gguf_writer.add_rope_dimension_count(head_dim)
    gguf_writer.add_key_length(head_dim)
    gguf_writer.add_value_length(head_dim)

    if hparams.get("rope_theta") is not None:
        gguf_writer.add_rope_freq_base(hparams["rope_theta"])
    if hparams.get("rms_norm_eps") is not None:
        gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    gguf_writer.add_file_type(ftype)

    # Pooling type for embedding models
    # Try to read from modules.json / 1_Pooling/config.json (sentence-transformers convention)
    pooling_type = None
    module_path = dir_model / "modules.json"
    if module_path.is_file():
        with open(module_path, encoding="utf-8") as f:
            modules = json.load(f)
        for mod in modules:
            if mod["type"].endswith("Pooling"):
                pooling_path = dir_model / mod["path"] / "config.json"
                if pooling_path.is_file():
                    with open(pooling_path, encoding="utf-8") as pf:
                        pooling = json.load(pf)
                    if pooling.get("pooling_mode_mean_tokens"):
                        pooling_type = gguf.PoolingType.MEAN
                    elif pooling.get("pooling_mode_cls_token"):
                        pooling_type = gguf.PoolingType.CLS
                    elif pooling.get("pooling_mode_lasttoken"):
                        pooling_type = gguf.PoolingType.LAST
                # Only the first pooling module is consulted.
                break
    if pooling_type is None:
        # Default to MEAN pooling for embedding models
        logger.info("  No pooling config found, defaulting to MEAN pooling")
        pooling_type = gguf.PoolingType.MEAN
    gguf_writer.add_pooling_type(pooling_type)

    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}")


# ---------------------------------------------------------------------------
# Tensor iteration from safetensors
# ---------------------------------------------------------------------------

def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
    """Yield (name, tensor) from safetensors files.

    Files matching ``*.safetensors`` in ``dir_model`` are visited in sorted
    order and every tensor is loaded on the CPU.

    Raises:
        FileNotFoundError: If the directory contains no ``.safetensors`` file.
    """
    from safetensors import safe_open

    safetensor_files = sorted(dir_model.glob("*.safetensors"))
    if not safetensor_files:
        raise FileNotFoundError(f"No .safetensors files in {dir_model}")

    for sf_path in safetensor_files:
        logger.info(f"Loading {sf_path.name}")
        with safe_open(str(sf_path), framework="pt", device="cpu") as f:
            for name in f.keys():
                yield name, f.get_tensor(name)


# ---------------------------------------------------------------------------
# I2_S ternary packing (platform-independent)
# ---------------------------------------------------------------------------
#
# I2_S format (from dequantize_row_i2_s in ggml-quants.c):
#   - Every 128 values form a block, packed into 32 bytes
#   - Each byte stores 4 values at positions [0*32+gp, 1*32+gp, 2*32+gp, 3*32+gp]
#     where gp is the byte index within the 32-byte group
#   - Encoding per byte: c0=(b>>6)&3, c1=(b>>4)&3, c2=(b>>2)&3, c3=(b>>0)&3
#   - Value mapping: 0 -> -1, 1 -> 0, 2 -> +1, 3 -> 0
#   - Scale is appended to the packed data: a 32-byte tail whose first 4 bytes
#     hold the float32 scale

def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
    """Quantize float weights to ternary and pack into I2_S layout.

    Uses the same quantization as BitLinear weight_quant_minmax():
        scale = 1.0 / mean(|w|)
        q = round(w * scale).clamp(-1, 1)
        dequant = q / scale = q * mean(|w|)

    The I2_S format is self-contained: packed ternary bytes followed by a
    32-byte tail whose first 4 bytes are the float32 dequantization scale.

    Args:
        w: float weight tensor of shape (M, K); M * K must be a multiple of 128
           because values are interleaved across whole 128-value blocks.

    Returns:
        uint8 array of length ``M * K // 4 + 32``: packed bytes, then the tail.

    Raises:
        ValueError: If ``w`` is not 2-D or its element count is not a multiple
            of 128 (a partial block cannot be truncated without losing values).
    """
    if w.ndim != 2:
        raise ValueError(f"quantize_to_i2_s expects a 2-D weight, got shape {w.shape}")
    n = w.size
    if n % 128 != 0:
        # A partial block spreads its values over all 32 bytes of the block, so the
        # first n // 4 bytes would not contain a usable encoding of the tensor.
        raise ValueError(
            f"quantize_to_i2_s requires the element count to be a multiple of 128, got {n}"
        )

    w_flat = w.flatten().astype(np.float32)

    # BitLinear weight_quant_minmax: scale = 1/mean(|w|), then round & clamp
    abs_mean = np.mean(np.abs(w_flat))
    abs_mean = max(abs_mean, 1e-5)
    inv_scale = 1.0 / abs_mean
    q_float = np.round(w_flat * inv_scale).clip(-1, 1)  # ternary: {-1, 0, 1}

    # scale for dequantization = abs_mean (i.e., dequant = q * abs_mean)
    scale = np.float32(abs_mean)

    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
    q = np.ones(n, dtype=np.uint8)  # default to 1 (zero)
    q[q_float > 0.5] = 2            # +1 -> 2
    q[q_float < -0.5] = 0           # -1 -> 0

    # One row per 128-value block; axis 1 selects which 2-bit slot of the byte.
    q = q.reshape(-1, 4, 32)

    # Pack: byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3
    packed = (q[:, 0, :] << 6) | (q[:, 1, :] << 4) | (q[:, 2, :] << 2) | q[:, 3, :]
    packed = packed.reshape(-1).astype(np.uint8)

    # I2_S format: packed_bytes + 32-byte aligned tail (scale in first 4 bytes of tail)
    # Total size = n_elements / 4 + 32 (as defined in ggml.c)
    packed_size = n // 4
    result = np.zeros(packed_size + 32, dtype=np.uint8)
    result[:packed_size] = packed
    # Write scale as float32 at offset packed_size
    result[packed_size:packed_size + 4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)

    return result


# ---------------------------------------------------------------------------
# Main conversion
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: convert a model directory to a GGUF file.

    Parses arguments, reads ``config.json``, writes metadata and vocabulary,
    then converts every mapped tensor according to ``--outtype``. Exits with
    status 1 when the model path is not a directory or the architecture is not
    ``qwen3``.
    """
    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF")
    parser.add_argument("model", type=Path, help="Model directory")
    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
                        help="Output type: f32, f16, or i2_s (ternary quantized)")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    dir_model = args.model
    if not dir_model.is_dir():
        logger.error(f"{dir_model} is not a directory")
        sys.exit(1)

    # Default output filename
    if args.outfile is None:
        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"

    # Load config
    with open(dir_model / "config.json", encoding="utf-8") as f:
        hparams = json.load(f)

    # Explicit check instead of assert so validation survives `python -O`.
    arch = hparams.get("model_type", "qwen3")
    if arch != "qwen3":
        logger.error(f"Expected qwen3 architecture, got {arch}")
        sys.exit(1)

    n_layers = hparams["num_hidden_layers"]

    # Determine ftype
    if args.outtype == "f32":
        ftype = 0  # GGML F32
    elif args.outtype == "f16":
        ftype = 1  # GGML F16
    else:  # i2_s
        ftype = 40  # LLAMA_FTYPE_MOSTLY_I2_S

    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")

    # Create GGUF writer
    gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3")

    # Set parameters
    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)

    # Set vocab
    logger.info("Setting tokenizer/vocab...")
    set_vocab(gguf_writer, dir_model, hparams)

    # Build tensor name map
    tensor_map = build_tensor_name_map(n_layers)

    # Process tensors
    logger.info("Processing tensors...")
    tensor_count = 0
    for hf_name, data_torch in iter_tensors(dir_model):
        # Skip tensors we don't need
        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
            continue

        # Strip "model." prefix if present
        name = hf_name
        if name.startswith("model."):
            name = name[len("model."):]

        # Look up GGUF name
        gguf_name = tensor_map.get(name)
        if gguf_name is None:
            logger.warning(f"Skipping unmapped tensor: {hf_name}")
            continue

        old_dtype = data_torch.dtype

        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
        if data_torch.dtype == torch.bfloat16:
            data_torch = data_torch.to(torch.float32)

        data = data_torch.squeeze().numpy()
        n_dims = len(data.shape)
        data_shape = data.shape

        # Determine if this is a linear weight suitable for ternary quantization
        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
        is_embed = gguf_name == "token_embd.weight"
        is_linear_weight = n_dims == 2 and not is_norm and not is_embed
        suit_i2 = is_linear_weight

        if args.outtype == "i2_s" and suit_i2:
            # --- I2_S ternary packing (scale embedded in data) ---
            packed = quantize_to_i2_s(data)
            data_qtype = gguf.GGMLQuantizationType.I2_S

            shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}"
            logger.info(f"  {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")

            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype)
            tensor_count += 1

        elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed):
            # 2D weight tensors (linear + embedding) -> f16
            data = data.astype(np.float16)
            logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
            gguf_writer.add_tensor(gguf_name, data)
            tensor_count += 1

        else:
            # norms, 1D tensors
            if args.outtype in ("f16", "i2_s"):
                data = data.astype(np.float16)
                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
            else:
                if data.dtype != np.float32:
                    data = data.astype(np.float32)
                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
            gguf_writer.add_tensor(gguf_name, data)
            tensor_count += 1

    logger.info(f"Total tensors written: {tensor_count}")

    # Note: output.weight (lm_head) is skipped for embedding models —
    # it is not needed (no token generation) and saves ~297MB for this model.

    # Write GGUF
    logger.info(f"Writing to {args.outfile}...")
    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

    logger.info("Done!")


if __name__ == "__main__":
    main()

# From 9a3f3a22a739f097969f612d25e826747b84143c Mon Sep 17 00:00:00 2001
# From: isHuangXin
# Date: Thu, 21 May 2026 14:56:53 +0800
# Subject: [PATCH 2/3] [feat] Add GGUF conversion and inference support for BitNet embedding 270m (Gemma3)
#
# - Add convert-bitnet-embedding-270m-to-gguf.py for Gemma3-based 270m models
# - Support f32, f16, and I2_S ternary quantization output types
# - Add AVX512BW SIMD paths for I2_S dot product in ggml-bitnet-mad.cpp
# - Add immintrin.h include and bitnet-lut-kernels.h guard in ggml-bitnet-lut.cpp
# - Add documentation for Gemma3 GGUF conversion implementation
# - Update llama.cpp submodule with Gemma3 architecture support
# ---
#  3rdparty/llama.cpp | 2 +-
#  ...itnet-embeddings-gemma3-gguf-conversion.md | 336 +++++++++++++
#  .../convert-bitnet-embedding-270m-to-gguf.py | 441 ++++++++++++++++++
#  3 files changed, 778 insertions(+), 1 deletion(-)
#  create mode 100644 docs/bitnet-embeddings-gemma3-gguf-conversion.md
#  create mode
100644 utils/convert-bitnet-embedding-270m-to-gguf.py diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp index 13e129947..a0d4c71d7 160000 --- a/3rdparty/llama.cpp +++ b/3rdparty/llama.cpp @@ -1 +1 @@ -Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15 +Subproject commit a0d4c71d70f5837451f8faec122c7f0e8aa242aa diff --git a/docs/bitnet-embeddings-gemma3-gguf-conversion.md b/docs/bitnet-embeddings-gemma3-gguf-conversion.md new file mode 100644 index 000000000..236b78d1e --- /dev/null +++ b/docs/bitnet-embeddings-gemma3-gguf-conversion.md @@ -0,0 +1,336 @@ +# BitNet Embeddings (Gemma3) GGUF Conversion Implementation + +## 1. Background + +`bitnet-embeddings-270m` is a Gemma3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: + +``` +x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) +``` + +This pattern does **not** exist in any standard llama.cpp architecture: +- Standard Gemma3: no per-projection norms +- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) + +### Model Config + +- Architecture: `Gemma3TextModel` +- hidden_size: 640, num_attention_heads: 4, num_key_value_heads: 1 +- head_dim: 256 (note: != hidden_size/num_heads = 160) +- intermediate_size: 2048, num_hidden_layers: 18 +- hidden_activation: gelu_pytorch_tanh +- vocab_size: 262144 +- rope_theta: 10000.0, rms_norm_eps: 1e-06 +- query_pre_attn_scalar: 256 +- tie_word_embeddings: true (implied, no separate output.weight) + +### Gemma3 vs Gemma2 Key Differences + +| Feature | Gemma2 | Gemma3 | +|---------|--------|--------| +| QK head norms | No | Yes (`q_norm`, `k_norm`) | +| Pre-FFW norm | `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` | +| Post-FFW norm | `post_ffw_norm` | `post_feedforward_layernorm` → `post_ffw_norm` | +| Post-attn 
norm | `post_attention_norm` | Same | +| Activation | GELU | GELU | +| Embedding scaling | sqrt(n_embd) | sqrt(n_embd) | + +### Per-Layer Tensors (7 extra norm tensors per layer) + +| Tensor | Shape | +|--------|-------| +| `self_attn.q_proj.norm.weight` | [640] | +| `self_attn.k_proj.norm.weight` | [640] | +| `self_attn.v_proj.norm.weight` | [640] | +| `self_attn.o_proj.norm.weight` | [1024] | +| `mlp.gate_proj.norm.weight` | [640] | +| `mlp.up_proj.norm.weight` | [640] | +| `mlp.down_proj.norm.weight` | [2048] | + +--- + +## 2. GGUF Tensor Name Mapping + +| HF Name | GGUF Name | Notes | +|----------|-----------|-------| +| `embed_tokens.weight` | `token_embd.weight` | | +| `norm.weight` | `output_norm.weight` | | +| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | +| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` | | +| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | +| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` | | +| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | +| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | +| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | +| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | +| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | +| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | +| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.gate_proj.weight` | 
`blk.{i}.ffn_gate.weight` | | +| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | +| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | +| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | + +--- + +## 3. Conversion Script + +### `utils/convert-bitnet-embedding-270m-to-gguf.py` + +Standalone conversion script (safetensors → GGUF). Key features: + +- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) +- Supports three output types: + - `--outtype f32`: all weights in float32 + - `--outtype f16`: 2D weights and embeddings as float16, norms as float16 + - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 +- Writes `key_length` and `value_length` metadata for head_dim=256 +- Writes `query_pre_attn_scalar = 256` for correct attention scaling +- GemmaTokenizerFast (BPE) tokenizer handling with pre-tokenizer hash verification +- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) +- EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1) +- Architecture string: `"gemma3"` + +### I2_S Ternary Packing + +The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation: + +- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)` +- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` +- Every 128 values form a block, packed into 32 bytes +- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3` +- Scale (float32) is appended at the end of the packed data buffer + +### Tensor Type Assignment + +| Tensor Type | f16 mode | i2_s mode | +|-------------|----------|-----------| +| 2D linear weights | float16 | I2_S 
ternary packed | +| Embedding weights | float16 | float16 | +| Norm weights (1D) | float16 | float16 | + +Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). + +--- + +## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) + +### 4.1 New Architecture: `LLM_ARCH_GEMMA3` + +Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`. + +### 4.2 Tensor Enums (shared with Qwen3) + +Reuses the 7 per-projection norm tensor enums added for Qwen3: + +```cpp +LLM_TENSOR_ATTN_Q_NORM_IN, +LLM_TENSOR_ATTN_K_NORM_IN, +LLM_TENSOR_ATTN_V_NORM_IN, +LLM_TENSOR_ATTN_OUT_NORM_IN, +LLM_TENSOR_FFN_GATE_NORM_IN, +LLM_TENSOR_FFN_UP_NORM_IN, +LLM_TENSOR_FFN_DOWN_NORM_IN, +``` + +### 4.3 Tensor Name Mappings for `LLM_ARCH_GEMMA3` + +```cpp +{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, +{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, +{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, +{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, +{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, +{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, +{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, +{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, +{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, +{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, +{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, +{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, +{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, +{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, +{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, +{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, +{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, +{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, +{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, +{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, +{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, +{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, +``` + +### 4.4 load_tensors (LLM_ARCH_GEMMA3) + +Based on Gemma2's tensor 
loading with additions: + +- QK head norms: `attn_q_norm`, `attn_k_norm` +- All 7 BitNet per-projection norm_in tensors (TENSOR_NOT_REQUIRED) + +```cpp +layer.attn_q_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_k_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_v_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_out_norm_in = create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); +layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_up_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff}, TENSOR_NOT_REQUIRED); +``` + +### 4.5 build_gemma3() Graph Function + +Combines Gemma2's structure with Qwen3's per-projection norm pattern: + +**Key features:** +- Embedding scaling by `sqrt(n_embd)` (Gemma convention) +- GELU activation (gelu_pytorch_tanh) +- QK head norms after Q/K projection +- Conditional per-projection RMSNorm (backward compatible) +- Post-attention and post-FFN layer norms +- `wo=NULL` pattern for `attn_out_norm_in` (same as Qwen3) +- `query_pre_attn_scalar` for attention scaling + +**Attention per-projection norms:** +``` +// Before Q/K/V matmul: +if (layer.attn_q_norm_in) { + cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); +} else { + cur_q = cur; +} +Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); +// QK head norms applied after projection +Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps); +Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm); +``` + +**O_proj norm** with `wo=NULL` pattern: +``` +cur = llm_build_kv(..., wo=NULL, ...); +if (layer.attn_out_norm_in) { + cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); +} +cur = ggml_mul_mat(ctx, layer.wo, cur); +``` + +**FFN per-projection norms with GELU:** +``` +if 
(layer.ffn_gate_norm_in) { + tmp_gate = rms_norm(cur) * gate_norm_in; +} else { + tmp_gate = cur; +} +tmp_gate = matmul(gate_proj, tmp_gate); +tmp_gate = gelu(tmp_gate); // GELU, not SILU +// ... +``` + +--- + +## 5. GGUF Conversion Process + +There are two GGUF files to produce, from **two different source models**: + +| GGUF Output | Source Model | Description | +|-------------|-------------|-------------| +| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline, standard float16 weights | +| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed weights | + +### 5.1 F16 GGUF: from multilingual-e5-270m-260311 + +```bash +python3 utils/convert-bitnet-embedding-270m-to-gguf.py \ + /path/to/multilingual-e5-270m-260311 \ + --outtype f16 +``` + +**What happens:** +1. Load `model.safetensors` (standard Gemma3 weights, bfloat16) +2. Convert all 2D weights (projections, embeddings) to float16 +3. Convert norm weights to float16 +4. Write GGUF with `gemma3` architecture metadata and tokenizer + +### 5.2 I2_S GGUF: from bitnet-embeddings-270m + +```bash +python3 utils/convert-bitnet-embedding-270m-to-gguf.py \ + /path/to/bitnet-embeddings-270m \ + --outtype i2_s +``` + +**What happens:** +1. Load `model.safetensors` (BitNet ternary weights, bfloat16) +2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer +3. For each 2D linear weight: quantize to I2_S ternary packed format +4. Keep embeddings (`token_embd.weight`) in float16 +5. Keep all norm weights in float16 +6. Skip `output.weight` (lm_head, not needed for embedding models) +7. Write GGUF with `I2_S` type tag for quantized tensors + +### 5.3 Why Two Different Source Models? 
+ +- `multilingual-e5-270m-260311` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference +- `bitnet-embeddings-270m` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference +- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization + +### 5.4 Tensor Type Summary + +| Tensor | F16 (from e5-270m) | I2_S (from bitnet-270m) | +|--------|---------------------|-------------------------| +| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | +| Embedding (`token_embd.weight`) | float16 | float16 | +| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | +| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 | +| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 | +| `output.weight` (lm_head) | skipped | skipped | + +--- + +## 6. Additional Changes + +### 6.1 ggml.c: F16 Norm Weight Support + +Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`. + +### 6.2 gguf-py: I2_S Type + +Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`. + +### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard + +Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available. + +### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD + +Added AVX512BW SIMD paths for I2_S dot product functions: +- `ggml_vec_dot_i2_i8_s_1x1` +- `ggml_vec_dot_i2_i8_s_1xN` +- `ggml_vec_dot_i2_i8_s_Nx1` + +--- + +## 7. 
def build_tensor_name_map(n_layers: int) -> dict[str, str]:
    """Return the HuggingFace -> GGUF tensor-name table for ``n_layers`` blocks.

    Keys are HF names without the leading ``model.`` prefix; values are the
    GGUF names. Two model-level entries come first, followed by the per-layer
    entries for layer 0, layer 1, ... in the order listed below.
    """
    # (HF suffix under "layers.<i>.", GGUF suffix under "blk.<i>.")
    per_layer_names = (
        # Layer norms
        ("input_layernorm.weight", "attn_norm.weight"),
        ("post_attention_layernorm.weight", "post_attention_norm.weight"),
        ("pre_feedforward_layernorm.weight", "ffn_norm.weight"),
        ("post_feedforward_layernorm.weight", "post_ffw_norm.weight"),
        # Self-attention projections
        ("self_attn.q_proj.weight", "attn_q.weight"),
        ("self_attn.k_proj.weight", "attn_k.weight"),
        ("self_attn.v_proj.weight", "attn_v.weight"),
        ("self_attn.o_proj.weight", "attn_output.weight"),
        # QK head norms (Gemma3)
        ("self_attn.q_norm.weight", "attn_q_norm.weight"),
        ("self_attn.k_norm.weight", "attn_k_norm.weight"),
        # Per-projection input norms (BitNet-specific)
        ("self_attn.q_proj.norm.weight", "attn_q_norm_in.weight"),
        ("self_attn.k_proj.norm.weight", "attn_k_norm_in.weight"),
        ("self_attn.v_proj.norm.weight", "attn_v_norm_in.weight"),
        ("self_attn.o_proj.norm.weight", "attn_output_norm_in.weight"),
        # MLP projections
        ("mlp.gate_proj.weight", "ffn_gate.weight"),
        ("mlp.up_proj.weight", "ffn_up.weight"),
        ("mlp.down_proj.weight", "ffn_down.weight"),
        # Per-projection input norms for MLP (BitNet-specific)
        ("mlp.gate_proj.norm.weight", "ffn_gate_norm_in.weight"),
        ("mlp.up_proj.norm.weight", "ffn_up_norm_in.weight"),
        ("mlp.down_proj.norm.weight", "ffn_down_norm_in.weight"),
    )

    table = {
        "embed_tokens.weight": "token_embd.weight",
        "norm.weight": "output_norm.weight",
    }
    for layer in range(n_layers):
        for hf_suffix, gguf_suffix in per_layer_names:
            table[f"layers.{layer}.{hf_suffix}"] = f"blk.{layer}.{gguf_suffix}"
    return table
def _does_token_look_special(token: str) -> bool:
    """Return True when ``token`` is bracketed like a control token.

    A token qualifies when it is non-empty, begins with ``<|``, ``<`` or ``[``
    and ends with ``|>``, ``>`` or ``]``. The opening and closing markers are
    checked independently, so they do not have to be a matching pair.
    """
    opening = ("<|", "<", "[")
    closing = ("|>", ">", "]")
    return bool(token) and token.startswith(opening) and token.endswith(closing)
for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + token = reverse_vocab[i] + + if not added_tokens_decoder[i].normalized: + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + + if added_tokens_decoder[i].special or _does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + toktypes.append(gguf.TokenType.USER_DEFINED) + + tokens.append(token) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + gguf_writer.add_tokenizer_model("gpt2") + gguf_writer.add_tokenizer_pre(tokpre) + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(gguf_writer) + + +# --------------------------------------------------------------------------- +# GGUF metadata +# --------------------------------------------------------------------------- + +def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int): + gguf_writer.add_name(dir_model.name) + + n_layers = hparams["num_hidden_layers"] + n_embd = hparams["hidden_size"] + n_head = hparams["num_attention_heads"] + n_head_kv = hparams.get("num_key_value_heads", n_head) + n_ff = hparams["intermediate_size"] + + gguf_writer.add_block_count(n_layers) + gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768)) + gguf_writer.add_embedding_length(n_embd) + gguf_writer.add_feed_forward_length(n_ff) + gguf_writer.add_head_count(n_head) + gguf_writer.add_head_count_kv(n_head_kv) + gguf_writer.add_vocab_size(hparams["vocab_size"]) + + head_dim = hparams.get("head_dim", n_embd // n_head) + gguf_writer.add_rope_dimension_count(head_dim) + gguf_writer.add_key_length(head_dim) + gguf_writer.add_value_length(head_dim) + + if 
def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
    """Lazily yield ``(name, tensor)`` for every tensor stored in ``dir_model``.

    All ``*.safetensors`` files directly inside ``dir_model`` are visited in
    sorted path order and opened one at a time with the PyTorch framework on
    the CPU device.

    Raises:
        FileNotFoundError: if the directory holds no ``*.safetensors`` file.
    """
    # Imported here so the dependency is only needed once iteration starts.
    from safetensors import safe_open

    shards = sorted(dir_model.glob("*.safetensors"))
    if len(shards) == 0:
        raise FileNotFoundError(f"No .safetensors files in {dir_model}")

    for shard in shards:
        logger.info(f"Loading {shard.name}")
        with safe_open(str(shard), framework="pt", device="cpu") as handle:
            for tensor_name in handle.keys():
                yield tensor_name, handle.get_tensor(tensor_name)
def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
    """Quantize float weights to ternary and pack them into the I2_S layout.

    Uses the same quantization as BitLinear weight_quant_minmax():
        scale   = 1.0 / mean(|w|)
        q       = round(w * scale).clamp(-1, 1)
        dequant = q / scale = q * mean(|w|)

    Layout produced: the flattened weights are split into blocks of 128
    values and each block becomes 32 bytes. Byte ``j`` of a block stores the
    2-bit codes of values ``j``, ``j + 32``, ``j + 64`` and ``j + 96`` in bits
    7-6, 5-4, 3-2 and 1-0 respectively, with -1/0/+1 encoded as 0/1/2.

    Args:
        w: 2-D float weight array of shape (M, K). ``M * K`` must be a
            non-zero multiple of 128.

    Returns:
        uint8 array of ``M * K // 4 + 32`` bytes: the packed codes followed by
        a 32-byte tail whose first 4 bytes are ``mean(|w|)`` as a float32 in
        native byte order; the remaining tail bytes are zero.

    Raises:
        ValueError: if ``w`` is not 2-D, is empty, or its element count is not
            a multiple of 128.
    """
    M, K = w.shape
    n = M * K
    # The block layout cannot represent a partial block: the output only has
    # room for n // 4 packed bytes, so padded blocks would be truncated and
    # interleaved values lost. Reject such inputs with a clear message instead
    # of failing later with a NumPy broadcasting error (and reject empty input,
    # whose mean would be NaN).
    if n == 0 or n % 128 != 0:
        raise ValueError(
            f"I2_S packing requires a non-empty tensor whose element count is a "
            f"multiple of 128, got shape {w.shape} ({n} elements)"
        )

    w_flat = w.flatten().astype(np.float32)

    abs_mean = np.mean(np.abs(w_flat))
    abs_mean = max(abs_mean, 1e-5)  # avoid dividing by ~0 for all-zero weights
    inv_scale = 1.0 / abs_mean
    q_float = np.round(w_flat * inv_scale).clip(-1, 1)

    scale = np.float32(abs_mean)

    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
    codes = np.ones(n, dtype=np.uint8)
    codes[q_float > 0.5] = 2
    codes[q_float < -0.5] = 0

    # One row of `lanes` per 128-value block; the four 32-value lanes of a
    # block are interleaved into the four bit pairs of its 32 output bytes.
    lanes = codes.reshape(n // 128, 4, 32)
    packed = (
        (lanes[:, 0, :] << 6)
        | (lanes[:, 1, :] << 4)
        | (lanes[:, 2, :] << 2)
        | lanes[:, 3, :]
    ).reshape(-1).astype(np.uint8)

    packed_size = n // 4
    result = np.zeros(packed_size + 32, dtype=np.uint8)
    result[:packed_size] = packed
    result[packed_size:packed_size + 4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)

    return result
argparse.ArgumentParser(description="Convert bitnet-embeddings-270m (Gemma3) to GGUF") + parser.add_argument("model", type=Path, help="Model directory") + parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file") + parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16", + help="Output type: f32, f16, or i2_s (ternary quantized)") + parser.add_argument("--verbose", action="store_true") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + dir_model = args.model + if not dir_model.is_dir(): + logger.error(f"{dir_model} is not a directory") + sys.exit(1) + + # Default output filename + if args.outfile is None: + suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype] + args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf" + + # Load config + with open(dir_model / "config.json") as f: + hparams = json.load(f) + + arch = hparams.get("model_type", "gemma3_text") + assert arch == "gemma3_text", f"Expected gemma3_text architecture, got {arch}" + + n_layers = hparams["num_hidden_layers"] + + # Determine ftype + if args.outtype == "f32": + ftype = 0 # GGML F32 + elif args.outtype == "f16": + ftype = 1 # GGML F16 + else: # i2_s + ftype = 40 # LLAMA_FTYPE_MOSTLY_I2_S + + logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})") + + # Create GGUF writer + gguf_writer = gguf.GGUFWriter(str(args.outfile), "gemma3") + + # Set parameters + set_gguf_parameters(gguf_writer, hparams, dir_model, ftype) + + # Set vocab + logger.info("Setting tokenizer/vocab...") + set_vocab(gguf_writer, dir_model, hparams) + + # Build tensor name map + tensor_map = build_tensor_name_map(n_layers) + + # Process tensors + logger.info("Processing tensors...") + tensor_count = 0 + for hf_name, data_torch in iter_tensors(dir_model): + # Skip tensors we don't need + if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + 
continue + + # Strip "model." prefix if present + name = hf_name + if name.startswith("model."): + name = name[len("model."):] + + # Look up GGUF name + gguf_name = tensor_map.get(name) + if gguf_name is None: + logger.warning(f"Skipping unmapped tensor: {hf_name}") + continue + + old_dtype = data_torch.dtype + + # Convert bf16 -> f32 first (bf16 not directly supported by gguf) + if data_torch.dtype == torch.bfloat16: + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + data_shape = data.shape + + # Determine if this is a linear weight suitable for ternary quantization + is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight") + is_embed = gguf_name == "token_embd.weight" + is_linear_weight = n_dims == 2 and not is_norm and not is_embed + suit_i2 = is_linear_weight + + if args.outtype == "i2_s" and suit_i2: + # --- I2_S ternary packing (scale embedded in data) --- + packed = quantize_to_i2_s(data) + data_qtype = gguf.GGMLQuantizationType.I2_S + + shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}" + logger.info(f" {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}") + + gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype) + tensor_count += 1 + + elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed): + # 2D weight tensors (linear + embedding) -> f16 + data = data.astype(np.float16) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") + gguf_writer.add_tensor(gguf_name, data) + tensor_count += 1 + + else: + # norms, 1D tensors + if args.outtype in ("f16", "i2_s"): + data = data.astype(np.float16) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") + else: + if data.dtype != np.float32: + data = data.astype(np.float32) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32") + gguf_writer.add_tensor(gguf_name, data) 
+ tensor_count += 1 + + logger.info(f"Total tensors written: {tensor_count}") + + # Write GGUF + logger.info(f"Writing to {args.outfile}...") + gguf_writer.write_header_to_file() + gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file() + gguf_writer.close() + + logger.info("Done!") + + +if __name__ == "__main__": + main() From 5720fc7c030483f8e20f824ea8345c1c2efbe252 Mon Sep 17 00:00:00 2001 From: isHuangXin Date: Sun, 24 May 2026 18:24:01 +0800 Subject: [PATCH 3/3] [refactor] Unify GGUF conversion script and docs for Qwen3/Gemma3 embedding models - Merge convert-bitnet-embedding-270m-to-gguf.py into convert-bitnet-embedding-to-gguf.py with auto-detection of model architecture (qwen3/gemma3_text) from config.json - Merge separate Qwen3 and Gemma3 conversion docs into a single bitnet-embeddings-gguf-conversion.md - Remove redundant per-architecture scripts and docs --- ...itnet-embeddings-gemma3-gguf-conversion.md | 336 ------------- docs/bitnet-embeddings-gguf-conversion.md | 410 ++++++++++++++++ ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 ------------ .../convert-bitnet-embedding-270m-to-gguf.py | 441 ------------------ utils/convert-bitnet-embedding-to-gguf.py | 117 +++-- 5 files changed, 483 insertions(+), 1123 deletions(-) delete mode 100644 docs/bitnet-embeddings-gemma3-gguf-conversion.md create mode 100644 docs/bitnet-embeddings-gguf-conversion.md delete mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md delete mode 100644 utils/convert-bitnet-embedding-270m-to-gguf.py diff --git a/docs/bitnet-embeddings-gemma3-gguf-conversion.md b/docs/bitnet-embeddings-gemma3-gguf-conversion.md deleted file mode 100644 index 236b78d1e..000000000 --- a/docs/bitnet-embeddings-gemma3-gguf-conversion.md +++ /dev/null @@ -1,336 +0,0 @@ -# BitNet Embeddings (Gemma3) GGUF Conversion Implementation - -## 1. Background - -`bitnet-embeddings-270m` is a Gemma3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). 
Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: - -``` -x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) -``` - -This pattern does **not** exist in any standard llama.cpp architecture: -- Standard Gemma3: no per-projection norms -- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) - -### Model Config - -- Architecture: `Gemma3TextModel` -- hidden_size: 640, num_attention_heads: 4, num_key_value_heads: 1 -- head_dim: 256 (note: != hidden_size/num_heads = 160) -- intermediate_size: 2048, num_hidden_layers: 18 -- hidden_activation: gelu_pytorch_tanh -- vocab_size: 262144 -- rope_theta: 10000.0, rms_norm_eps: 1e-06 -- query_pre_attn_scalar: 256 -- tie_word_embeddings: true (implied, no separate output.weight) - -### Gemma3 vs Gemma2 Key Differences - -| Feature | Gemma2 | Gemma3 | -|---------|--------|--------| -| QK head norms | No | Yes (`q_norm`, `k_norm`) | -| Pre-FFW norm | `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` | -| Post-FFW norm | `post_ffw_norm` | `post_feedforward_layernorm` → `post_ffw_norm` | -| Post-attn norm | `post_attention_norm` | Same | -| Activation | GELU | GELU | -| Embedding scaling | sqrt(n_embd) | sqrt(n_embd) | - -### Per-Layer Tensors (7 extra norm tensors per layer) - -| Tensor | Shape | -|--------|-------| -| `self_attn.q_proj.norm.weight` | [640] | -| `self_attn.k_proj.norm.weight` | [640] | -| `self_attn.v_proj.norm.weight` | [640] | -| `self_attn.o_proj.norm.weight` | [1024] | -| `mlp.gate_proj.norm.weight` | [640] | -| `mlp.up_proj.norm.weight` | [640] | -| `mlp.down_proj.norm.weight` | [2048] | - ---- - -## 2. 
GGUF Tensor Name Mapping - -| HF Name | GGUF Name | Notes | -|----------|-----------|-------| -| `embed_tokens.weight` | `token_embd.weight` | | -| `norm.weight` | `output_norm.weight` | | -| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | -| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` | | -| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | -| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` | | -| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | -| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | -| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | -| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | -| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | -| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | -| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | -| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | -| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | -| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | - ---- - -## 3. 
Conversion Script - -### `utils/convert-bitnet-embedding-270m-to-gguf.py` - -Standalone conversion script (safetensors → GGUF). Key features: - -- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) -- Supports three output types: - - `--outtype f32`: all weights in float32 - - `--outtype f16`: 2D weights and embeddings as float16, norms as float16 - - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 -- Writes `key_length` and `value_length` metadata for head_dim=256 -- Writes `query_pre_attn_scalar = 256` for correct attention scaling -- GemmaTokenizerFast (BPE) tokenizer handling with pre-tokenizer hash verification -- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) -- EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1) -- Architecture string: `"gemma3"` - -### I2_S Ternary Packing - -The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation: - -- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)` -- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` -- Every 128 values form a block, packed into 32 bytes -- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3` -- Scale (float32) is appended at the end of the packed data buffer - -### Tensor Type Assignment - -| Tensor Type | f16 mode | i2_s mode | -|-------------|----------|-----------| -| 2D linear weights | float16 | I2_S ternary packed | -| Embedding weights | float16 | float16 | -| Norm weights (1D) | float16 | float16 | - -Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). - ---- - -## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) - -### 4.1 New Architecture: `LLM_ARCH_GEMMA3` - -Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`. 
- -### 4.2 Tensor Enums (shared with Qwen3) - -Reuses the 7 per-projection norm tensor enums added for Qwen3: - -```cpp -LLM_TENSOR_ATTN_Q_NORM_IN, -LLM_TENSOR_ATTN_K_NORM_IN, -LLM_TENSOR_ATTN_V_NORM_IN, -LLM_TENSOR_ATTN_OUT_NORM_IN, -LLM_TENSOR_FFN_GATE_NORM_IN, -LLM_TENSOR_FFN_UP_NORM_IN, -LLM_TENSOR_FFN_DOWN_NORM_IN, -``` - -### 4.3 Tensor Name Mappings for `LLM_ARCH_GEMMA3` - -```cpp -{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, -{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, -{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, -{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, -{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, -{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, -{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, -{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, -{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, -{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, -{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, -``` - -### 4.4 load_tensors (LLM_ARCH_GEMMA3) - -Based on Gemma2's tensor loading with additions: - -- QK head norms: `attn_q_norm`, `attn_k_norm` -- All 7 BitNet per-projection norm_in tensors (TENSOR_NOT_REQUIRED) - -```cpp -layer.attn_q_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_k_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_v_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_out_norm_in = 
create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); -layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); -layer.ffn_up_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); -layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff}, TENSOR_NOT_REQUIRED); -``` - -### 4.5 build_gemma3() Graph Function - -Combines Gemma2's structure with Qwen3's per-projection norm pattern: - -**Key features:** -- Embedding scaling by `sqrt(n_embd)` (Gemma convention) -- GELU activation (gelu_pytorch_tanh) -- QK head norms after Q/K projection -- Conditional per-projection RMSNorm (backward compatible) -- Post-attention and post-FFN layer norms -- `wo=NULL` pattern for `attn_out_norm_in` (same as Qwen3) -- `query_pre_attn_scalar` for attention scaling - -**Attention per-projection norms:** -``` -// Before Q/K/V matmul: -if (layer.attn_q_norm_in) { - cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); - cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); -} else { - cur_q = cur; -} -Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); -// QK head norms applied after projection -Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps); -Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm); -``` - -**O_proj norm** with `wo=NULL` pattern: -``` -cur = llm_build_kv(..., wo=NULL, ...); -if (layer.attn_out_norm_in) { - cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); - cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); -} -cur = ggml_mul_mat(ctx, layer.wo, cur); -``` - -**FFN per-projection norms with GELU:** -``` -if (layer.ffn_gate_norm_in) { - tmp_gate = rms_norm(cur) * gate_norm_in; -} else { - tmp_gate = cur; -} -tmp_gate = matmul(gate_proj, tmp_gate); -tmp_gate = gelu(tmp_gate); // GELU, not SILU -// ... -``` - ---- - -## 5. 
GGUF Conversion Process - -There are two GGUF files to produce, from **two different source models**: - -| GGUF Output | Source Model | Description | -|-------------|-------------|-------------| -| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline, standard float16 weights | -| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed weights | - -### 5.1 F16 GGUF: from multilingual-e5-270m-260311 - -```bash -python3 utils/convert-bitnet-embedding-270m-to-gguf.py \ - /path/to/multilingual-e5-270m-260311 \ - --outtype f16 -``` - -**What happens:** -1. Load `model.safetensors` (standard Gemma3 weights, bfloat16) -2. Convert all 2D weights (projections, embeddings) to float16 -3. Convert norm weights to float16 -4. Write GGUF with `gemma3` architecture metadata and tokenizer - -### 5.2 I2_S GGUF: from bitnet-embeddings-270m - -```bash -python3 utils/convert-bitnet-embedding-270m-to-gguf.py \ - /path/to/bitnet-embeddings-270m \ - --outtype i2_s -``` - -**What happens:** -1. Load `model.safetensors` (BitNet ternary weights, bfloat16) -2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer -3. For each 2D linear weight: quantize to I2_S ternary packed format -4. Keep embeddings (`token_embd.weight`) in float16 -5. Keep all norm weights in float16 -6. Skip `output.weight` (lm_head, not needed for embedding models) -7. Write GGUF with `I2_S` type tag for quantized tensors - -### 5.3 Why Two Different Source Models? 
- -- `multilingual-e5-270m-260311` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference -- `bitnet-embeddings-270m` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference -- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization - -### 5.4 Tensor Type Summary - -| Tensor | F16 (from e5-270m) | I2_S (from bitnet-270m) | -|--------|---------------------|-------------------------| -| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | -| Embedding (`token_embd.weight`) | float16 | float16 | -| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | -| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 | -| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 | -| `output.weight` (lm_head) | skipped | skipped | - ---- - -## 6. Additional Changes - -### 6.1 ggml.c: F16 Norm Weight Support - -Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`. - -### 6.2 gguf-py: I2_S Type - -Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`. - -### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard - -Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available. - -### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD - -Added AVX512BW SIMD paths for I2_S dot product functions: -- `ggml_vec_dot_i2_i8_s_1x1` -- `ggml_vec_dot_i2_i8_s_1xN` -- `ggml_vec_dot_i2_i8_s_Nx1` - ---- - -## 7. 
Build and Run - -```bash -# Build with BitNet repo (includes I2_S support) -cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release -cmake --build build --target llama-embedding llama-bench -j$(nproc) - -# Run embedding inference -build/bin/llama-embedding -m bitnet-embeddings-270m-i2_s.gguf \ - -p "hello world" --embd-normalize 2 --embd-output-format array - -# Benchmark: F16 vs I2_S -build/bin/llama-bench -m multilingual-e5-270m-f16.gguf \ - -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 - -build/bin/llama-bench -m bitnet-embeddings-270m-i2_s.gguf \ - -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 -``` diff --git a/docs/bitnet-embeddings-gguf-conversion.md b/docs/bitnet-embeddings-gguf-conversion.md new file mode 100644 index 000000000..a4ee919d6 --- /dev/null +++ b/docs/bitnet-embeddings-gguf-conversion.md @@ -0,0 +1,410 @@ +# BitNet Embeddings GGUF Conversion Implementation + +## 1. Background + +BitNet embedding models apply per-projection RMSNorm (`BitLinear`) before each linear projection (q/k/v/o/gate/up/down). 
Each projection has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: + +``` +x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) +``` + +This pattern does **not** exist in any standard llama.cpp architecture: +- Standard Qwen3/Gemma3: no per-projection norms +- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) + +Currently two base architectures are supported: + +| | bitnet-embeddings-0.6b (Qwen3) | bitnet-embeddings-270m (Gemma3) | +|---|---|---| +| Architecture | `Qwen3Model` | `Gemma3TextModel` | +| hidden_size | 1024 | 640 | +| num_attention_heads | 16 | 4 | +| num_key_value_heads | 8 | 1 | +| head_dim | 128 (note: != hidden_size/num_heads = 64) | 256 (note: != hidden_size/num_heads = 160) | +| intermediate_size | 3072 | 2048 | +| num_hidden_layers | 28 | 18 | +| hidden_activation | SiLU | gelu_pytorch_tanh | +| vocab_size | 151936 | 262144 | +| rope_theta | 1000000 | 10000.0 | +| rms_norm_eps | 1e-06 | 1e-06 | +| query_pre_attn_scalar | N/A | 256 | +| tie_word_embeddings | true | true | + +### Gemma3 vs Qwen3 Key Differences + +| Feature | Qwen3 | Gemma3 | +|---------|-------|--------| +| Post-attn norm | No | Yes (`post_attention_norm`) | +| Post-FFW norm | No | Yes (`post_ffw_norm`) | +| Pre-FFW norm naming | `post_attention_layernorm` → `ffn_norm` | `pre_feedforward_layernorm` → `ffn_norm` | +| QK head norms | Yes | Yes | +| Activation | SiLU | GELU | +| Embedding scaling | No | sqrt(n_embd) | +| EOS token override | Yes (`<\|endoftext\|>` 151643) | No (auto from tokenizer) | + +### Per-Layer Tensors (7 extra norm tensors per layer) + +| Tensor | Qwen3 Shape | Gemma3 Shape | +|--------|-------------|--------------| +| `self_attn.q_proj.norm.weight` | [1024] | [640] | +| `self_attn.k_proj.norm.weight` | [1024] | [640] | +| `self_attn.v_proj.norm.weight` | [1024] | [640] | +| `self_attn.o_proj.norm.weight` | [2048] | [1024] 
| +| `mlp.gate_proj.norm.weight` | [1024] | [640] | +| `mlp.up_proj.norm.weight` | [1024] | [640] | +| `mlp.down_proj.norm.weight` | [3072] | [2048] | + +--- + +## 2. GGUF Tensor Name Mapping + +### Common Tensors (both architectures) + +| HF Name | GGUF Name | Notes | +|----------|-----------|-------| +| `embed_tokens.weight` | `token_embd.weight` | | +| `norm.weight` | `output_norm.weight` | | +| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | +| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | +| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | +| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | +| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | +| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | +| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | +| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | +| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | +| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | +| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | + +### Architecture-Specific Tensors + +**Qwen3:** + +| HF Name | GGUF Name | +|----------|-----------| +| 
`layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | + +**Gemma3 (note: `post_attention_layernorm` maps to a different GGUF tensor than in Qwen3):** + +| HF Name | GGUF Name | +|----------|-----------| +| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.post_attention_norm.weight` | +| `layers.{i}.pre_feedforward_layernorm.weight` | `blk.{i}.ffn_norm.weight` | +| `layers.{i}.post_feedforward_layernorm.weight` | `blk.{i}.post_ffw_norm.weight` | + +--- + +## 3. Conversion Script + +### `utils/convert-bitnet-embedding-to-gguf.py` + +Unified standalone conversion script (safetensors → GGUF) that **auto-detects** the model architecture from `config.json`'s `model_type` field (`qwen3` or `gemma3_text`). Key features: + +- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) +- Auto-detection of architecture and GGUF arch string (`qwen3` / `gemma3`) +- Supports three output types: + - `--outtype f32`: all weights in float32 + - `--outtype f16`: all tensors (2D weights, embeddings, and norms) as float16 + - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 +- Writes `key_length` and `value_length` metadata for correct head_dim (critical: head_dim != hidden_size/num_heads for both models, so the default calculation would give wrong values) +- BPE tokenizer handling with per-architecture pre-tokenizer hash verification: + - Qwen3: GPT-2 BPE tokenizer + - Gemma3: GemmaTokenizerFast (BPE) +- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) +- Architecture-specific tokenizer handling: + - Qwen3: EOS token override (`<|endoftext|>` 151643) + `add_eos_token(True)` for last-token pooling + - Gemma3: EOS token auto-set by SpecialVocab from tokenizer_config.json (eos_token_id=1) +- Gemma3: writes `query_pre_attn_scalar = 256` for correct attention scaling + +### I2_S Ternary Packing + +The I2_S format packs ternary weights {-1, 0, +1} into a 2-bit representation: + +- Quantization: `scale = 1/mean(|w|)`, 
`q = round(w * scale).clamp(-1, 1)` +- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` +- Every 128 values form a block, packed into 32 bytes +- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3` +- Scale (float32) is appended at the end of the packed data buffer + +### Tensor Type Assignment + +| Tensor Type | f16 mode | i2_s mode | +|-------------|----------|-----------| +| 2D linear weights | float16 | I2_S ternary packed | +| Embedding weights | float16 | float16 | +| Norm weights (1D) | float16 | float16 | + +Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). + +--- + +## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) + +### 4.1 New Architecture: `LLM_ARCH_GEMMA3` + +Added after `LLM_ARCH_GEMMA2` in the `llm_arch` enum with name mapping `"gemma3"`. Qwen3 (`LLM_ARCH_QWEN3`) was added by the 0.6b adaptation. + +### 4.2 New Tensor Enums (shared across architectures) + +Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`: + +```cpp +LLM_TENSOR_ATTN_Q_NORM_IN, +LLM_TENSOR_ATTN_K_NORM_IN, +LLM_TENSOR_ATTN_V_NORM_IN, +LLM_TENSOR_ATTN_OUT_NORM_IN, +LLM_TENSOR_FFN_GATE_NORM_IN, +LLM_TENSOR_FFN_UP_NORM_IN, +LLM_TENSOR_FFN_DOWN_NORM_IN, +``` + +### 4.3 Layer Struct Fields + +Added to `struct llama_layer`: + +```cpp +struct ggml_tensor * attn_q_norm_in; +struct ggml_tensor * attn_k_norm_in; +struct ggml_tensor * attn_v_norm_in; +struct ggml_tensor * attn_out_norm_in; +struct ggml_tensor * ffn_gate_norm_in; +struct ggml_tensor * ffn_up_norm_in; +struct ggml_tensor * ffn_down_norm_in; +``` + +### 4.4 Tensor Name Mappings + +Both `LLM_ARCH_QWEN3` and `LLM_ARCH_GEMMA3` include the 7 per-projection norm tensor mappings plus standard tensors (see Section 2 for full mapping). 
Key differences: + +- Qwen3 includes `LLM_TENSOR_OUTPUT` (`"output"`); Gemma3 does not (uses tied embeddings directly) +- Gemma3 additionally includes `LLM_TENSOR_ATTN_POST_NORM` (`"blk.%d.post_attention_norm"`) and `LLM_TENSOR_FFN_POST_NORM` (`"blk.%d.post_ffw_norm"`) + +### 4.5 load_tensors + +Both architectures load the 7 per-projection norm tensors as optional (`TENSOR_NOT_REQUIRED`): + +```cpp +layer.attn_q_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_k_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_v_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_out_norm_in = create_tensor(tn(...), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); +layer.ffn_gate_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_up_norm_in = create_tensor(tn(...), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_down_norm_in = create_tensor(tn(...), {n_ff}, TENSOR_NOT_REQUIRED); +``` + +Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (Qwen3: 2048, Gemma3: 1024), `down_proj.norm` input dimension is `n_ff` (Qwen3: 3072, Gemma3: 2048). + +Both graph functions use the same per-projection norm pattern. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to the original. + +**Attention per-projection norms:** +``` +// Before Q/K/V matmul: +if (layer.attn_q_norm_in) { + cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); +} else { + cur_q = cur; +} +Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); +// QK head norms applied after projection +Qcur = ggml_rms_norm(ctx, Qcur, hparams.f_norm_rms_eps); +Qcur = ggml_mul(ctx, Qcur, layer.attn_q_norm); +``` + +**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. 
Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually: + +``` +cur = llm_build_kv(..., wo=NULL, ...); // returns attention output without o_proj +if (layer.attn_out_norm_in) { + cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); +} +cur = ggml_mul_mat(ctx, layer.wo, cur); +``` + +**FFN per-projection norms:** +``` +// Instead of llm_build_ffn(), manually: +if (layer.ffn_gate_norm_in) { + tmp_gate = rms_norm(cur) * gate_norm_in; +} else { + tmp_gate = cur; +} +tmp_gate = matmul(gate_proj, tmp_gate); +tmp_gate = activation(tmp_gate); // SiLU for Qwen3, GELU for Gemma3 +// Similarly for up_proj +tmp = tmp_gate * tmp_up; + +if (layer.ffn_down_norm_in) { + tmp = rms_norm(tmp) * down_norm_in; +} +cur = matmul(down_proj, tmp); +``` + +**Gemma3-specific differences:** +- Embedding scaling by `sqrt(n_embd)` (Gemma convention) +- GELU activation instead of SiLU +- Post-attention and post-FFN layer norms +- `query_pre_attn_scalar` for attention scaling + +--- + +## 5. GGUF Conversion Process + +Each model variant requires two GGUF files from **two different source models**: + +### 5.1 Qwen3 (0.6b) + +| GGUF Output | Source Model | Description | +|-------------|-------------|-------------| +| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline | +| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed | + +**F16 (from standard Qwen3 model):** +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/multilingual-e5-0.6b \ + --outtype f16 \ + --outfile embeddings-0.6b-f16.gguf +``` + +What happens: +1. Load `model.safetensors` (standard Qwen3 weights, bfloat16) +2. Convert all 2D weights (projections, embeddings) to float16 +3. Convert norm weights to float16 +4. 
Write GGUF with `qwen3` architecture metadata and tokenizer + +**Output:** ~1.11 GiB (595.78M params) + +**I2_S (from BitNet model):** +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-0.6b \ + --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s +``` + +What happens: +1. Load `model.safetensors` (BitNet ternary weights, bfloat16) +2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer +3. For each 2D linear weight: quantize to I2_S ternary packed format +4. Keep embeddings (`token_embd.weight`) in float16 +5. Keep all norm weights in float16 +6. Skip `output.weight` (lm_head, not needed for embedding models) +7. Write GGUF with `I2_S` type tag for quantized tensors + +**Output:** ~699 MiB (~61% of F16 size) + +### 5.2 Gemma3 (270m) + +| GGUF Output | Source Model | Description | +|-------------|-------------|-------------| +| `multilingual-e5-270m-f16.gguf` | `multilingual-e5-270m-260311` (standard Gemma3) | F16 baseline | +| `bitnet-embeddings-270m-i2_s.gguf` | `bitnet-embeddings-270m` (BitNet ternary) | I2_S ternary packed | + +**F16 (from standard Gemma3 model):** +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/multilingual-e5-270m-260311 \ + --outtype f16 +``` + +What happens: +1. Load `model.safetensors` (standard Gemma3 weights, bfloat16) +2. Convert all 2D weights (projections, embeddings) to float16 +3. Convert norm weights to float16 +4. Write GGUF with `gemma3` architecture metadata and tokenizer + +**I2_S (from BitNet model):** +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-270m \ + --outtype i2_s +``` + +What happens: +1. Load `model.safetensors` (BitNet ternary weights, bfloat16) +2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer +3. For each 2D linear weight: quantize to I2_S ternary packed format +4. Keep embeddings (`token_embd.weight`) in float16 +5. 
Keep all norm weights in float16 +6. Skip `output.weight` (lm_head, not needed for embedding models) +7. Write GGUF with `I2_S` type tag for quantized tensors + +### 5.3 Why Two Different Source Models? + +- `multilingual-e5-*` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference +- `bitnet-embeddings-*` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference +- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization + +### 5.4 Tensor Type Summary + +| Tensor | F16 (baseline) | I2_S (BitNet) | +|--------|----------------|---------------| +| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | +| Embedding (`token_embd.weight`) | float16 | float16 | +| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | +| Layer norms (attn_norm, ffn_norm, etc.) | float16 | float16 | +| QK head norms (`attn_q_norm`, `attn_k_norm`) | float16 | float16 | +| `output.weight` (lm_head) | skipped | skipped | + +--- + +## 6. Additional Changes + +### 6.1 ggml.c: F16 Norm Weight Support + +Added `ggml_compute_forward_mul_f32_f16()` function to support element-wise multiplication where norm weights are stored in float16. Modified `ggml_compute_forward_mul()` to dispatch based on `src1->type`. + +### 6.2 gguf-py: I2_S Type + +Added `I2_S = 36` to `GGMLQuantizationType` enum and `(4, 1)` quant size in `constants.py`. + +### 6.3 CMakeLists.txt: BitNet LUT Kernels Guard + +Guarded `bitnet-lut-kernels.h` include with `if (GGML_BITNET_ARM_TL1 OR GGML_BITNET_X86_TL2)` to prevent build errors when LUT kernels are not available. + +### 6.4 ggml-bitnet-mad.cpp: AVX512 SIMD + +Added AVX512BW SIMD paths for I2_S dot product functions: +- `ggml_vec_dot_i2_i8_s_1x1` +- `ggml_vec_dot_i2_i8_s_1xN` +- `ggml_vec_dot_i2_i8_s_Nx1` + +--- + +## 7. 
Build and Run + +```bash +# Build with BitNet repo (includes I2_S support) +cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build --target llama-embedding llama-bench -j$(nproc) + +# Run embedding inference (Qwen3 example) +build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -p "hello world" --embd-normalize 2 --embd-output-format array + +# Run embedding inference (Gemma3 example) +build/bin/llama-embedding -m bitnet-embeddings-270m-i2_s.gguf \ + -p "hello world" --embd-normalize 2 --embd-output-format array + +# Benchmark: F16 vs I2_S (Qwen3) +build/bin/llama-bench -m embeddings-0.6b-f16.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 + +build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 + +# Benchmark: F16 vs I2_S (Gemma3) +build/bin/llama-bench -m multilingual-e5-270m-f16.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 + +build/bin/llama-bench -m bitnet-embeddings-270m-i2_s.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 +``` diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md deleted file mode 100644 index 9d63c9300..000000000 --- a/docs/bitnet-embeddings-qwen3-gguf-conversion.md +++ /dev/null @@ -1,302 +0,0 @@ -# BitNet Embeddings (Qwen3) GGUF Conversion Implementation - -## 1. Background - -`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). 
Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: - -``` -x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) -``` - -This pattern does **not** exist in any standard llama.cpp architecture: -- Standard Qwen3: no per-projection norms -- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) - -### Model Config - -- Architecture: `Qwen3Model` -- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8 -- head_dim: 128 (note: != hidden_size/num_heads = 64) -- intermediate_size: 3072, num_hidden_layers: 28 -- tie_word_embeddings: true -- rope_theta: 1000000, rms_norm_eps: 1e-06 - -### Per-Layer Tensors (7 extra norm tensors per layer) - -| Tensor | Shape | -|--------|-------| -| `self_attn.q_proj.norm.weight` | [1024] | -| `self_attn.k_proj.norm.weight` | [1024] | -| `self_attn.v_proj.norm.weight` | [1024] | -| `self_attn.o_proj.norm.weight` | [2048] | -| `mlp.gate_proj.norm.weight` | [1024] | -| `mlp.up_proj.norm.weight` | [1024] | -| `mlp.down_proj.norm.weight` | [3072] | - ---- - -## 2. 
GGUF Tensor Name Mapping - -| HF Name | GGUF Name | Notes | -|----------|-----------|-------| -| `embed_tokens.weight` | `token_embd.weight` | | -| `norm.weight` | `output_norm.weight` | | -| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | -| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | -| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | -| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | -| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | -| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | -| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | -| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | -| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | -| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | -| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | -| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | -| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | -| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | - ---- - -## 3. Conversion Script - -### `utils/convert-bitnet-embedding-to-gguf.py` - -Standalone conversion script (safetensors → GGUF). 
Key features: - -- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) -- Supports three output types: - - `--outtype f32`: all weights in float32 - - `--outtype f16`: 2D weights and embeddings as float16, norms as float16 - - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 -- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64) -- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification -- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) -- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling -- Architecture string: `"qwen3"` - -### I2_S Ternary Packing - -The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation: - -- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)` -- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` -- Every 128 values form a block, packed into 32 bytes -- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3` -- Scale (float32) is appended at the end of the packed data buffer - -### Tensor Type Assignment - -| Tensor Type | f16 mode | i2_s mode | -|-------------|----------|-----------| -| 2D linear weights | float16 | I2_S ternary packed | -| Embedding weights | float16 | float16 | -| Norm weights (1D) | float16 | float16 | - -Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). - ---- - -## 4. 
C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) - -### 4.1 New Tensor Enums - -Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`: - -```cpp -LLM_TENSOR_ATTN_Q_NORM_IN, -LLM_TENSOR_ATTN_K_NORM_IN, -LLM_TENSOR_ATTN_V_NORM_IN, -LLM_TENSOR_ATTN_OUT_NORM_IN, -LLM_TENSOR_FFN_GATE_NORM_IN, -LLM_TENSOR_FFN_UP_NORM_IN, -LLM_TENSOR_FFN_DOWN_NORM_IN, -``` - -### 4.2 Tensor Name Mappings - -Added to `LLM_ARCH_QWEN3` tensor name map: - -```cpp -{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, -{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, -{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, -{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, -{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, -{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, -{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, -``` - -### 4.3 Layer Struct Fields - -Added to `struct llama_layer`: - -```cpp -struct ggml_tensor * attn_q_norm_in; -struct ggml_tensor * attn_k_norm_in; -struct ggml_tensor * attn_v_norm_in; -struct ggml_tensor * attn_out_norm_in; -struct ggml_tensor * ffn_gate_norm_in; -struct ggml_tensor * ffn_up_norm_in; -struct ggml_tensor * ffn_down_norm_in; -``` - -### 4.4 load_tensors (LLM_ARCH_QWEN3) - -Added optional loading with `TENSOR_NOT_REQUIRED`: - -```cpp -layer.attn_q_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_k_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_v_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); -layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); -layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); -layer.ffn_up_norm_in = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN, "weight", i), 
{n_embd}, TENSOR_NOT_REQUIRED); -layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff}, TENSOR_NOT_REQUIRED); -``` - -Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072). - -### 4.5 build_qwen3() Graph Modifications - -The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to original. - -**Attention per-projection norms:** -``` -// Before Q/K/V matmul: -if (layer.attn_q_norm_in) { - cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); - cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); -} else { - cur_q = cur; -} -Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); -// Similarly for K, V -``` - -**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually: - -``` -cur = llm_build_kv(..., wo=NULL, ...); // returns attention output without o_proj -if (layer.attn_out_norm_in) { - cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); - cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); -} -cur = ggml_mul_mat(ctx, layer.wo, cur); -``` - -**FFN per-projection norms:** -``` -// Instead of llm_build_ffn(), manually: -if (layer.ffn_gate_norm_in) { - tmp_gate = rms_norm(cur) * gate_norm_in; -} else { - tmp_gate = cur; -} -tmp_gate = matmul(gate_proj, tmp_gate); -// Similarly for up_proj -tmp = silu(tmp_gate) * tmp_up; - -if (layer.ffn_down_norm_in) { - tmp = rms_norm(tmp) * down_norm_in; -} -cur = matmul(down_proj, tmp); -``` - ---- - -## 5. 
GGUF Conversion Process - -There are two GGUF files to produce, from **two different source models**: - -| GGUF Output | Source Model | Description | -|-------------|-------------|-------------| -| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights | -| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights | - -### 5.1 F16 GGUF: from multilingual-e5-0.6b - -The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model: - -```bash -python3 /path/to/llama.cpp/convert_hf_to_gguf.py \ - /path/to/multilingual-e5-0.6b \ - --outtype f16 \ - --outfile embeddings-0.6b-f16.gguf -``` - -**What happens:** -1. Load `model.safetensors` (standard Qwen3 weights, bfloat16) -2. Convert all 2D weights (projections, embeddings) to float16 -3. Convert norm weights to float32 -4. Write GGUF with `qwen3` architecture metadata and tokenizer - -**Output:** ~1.11 GiB (595.78M params) - -### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b - -The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization: - -```bash -python3 utils/convert-bitnet-embedding-to-gguf.py \ - /path/to/bitnet-embeddings-0.6b \ - --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s -``` - -**What happens:** -1. Load `model.safetensors` (BitNet ternary weights, bfloat16) -2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2) -3. 
For each 2D linear weight (q/k/v/o/gate/up/down projections): - - Compute scale: `scale = 1 / mean(|w|)` - - Quantize: `q = round(w * scale).clamp(-1, 1)` - - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2` - - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each) - - Append per-row float32 scale -4. Keep embeddings (`token_embd.weight`) in float16 (not ternary) -5. Keep all norm weights in float16 -6. Skip `output.weight` (lm_head, not needed for embedding models) -7. Write GGUF with `I2_S` type tag for quantized tensors - -**Output:** ~699 MiB (~50% of F16 size) - -### 5.3 Why Two Different Source Models? - -- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference -- `bitnet-embeddings-0.6b` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference -- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization - -### 5.4 Tensor Type Summary - -| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) | -|--------|---------------------|-------------------------| -| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | -| Embedding (`token_embd.weight`) | float16 | float16 | -| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | -| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 | -| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 | -| `output.weight` (lm_head) | present | skipped | - ---- - -## 6. 
Build and Run - -```bash -# Build with BitNet repo (includes I2_S support) -cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release -cmake --build build --target llama-embedding llama-bench -j$(nproc) - -# Run embedding inference -build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ - -p "hello world" --embd-normalize 2 --embd-output-format array - -# Benchmark: F16 vs I2_S -build/bin/llama-bench -m embeddings-0.6b-f16.gguf \ - -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 - -build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ - -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 -``` diff --git a/utils/convert-bitnet-embedding-270m-to-gguf.py b/utils/convert-bitnet-embedding-270m-to-gguf.py deleted file mode 100644 index 4408452ee..000000000 --- a/utils/convert-bitnet-embedding-270m-to-gguf.py +++ /dev/null @@ -1,441 +0,0 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -import sys -from hashlib import sha256 -from pathlib import Path -from typing import Any, Iterator - -import numpy as np -import torch - -# Allow using the local gguf-py if present -if "NO_LOCAL_GGUF" not in os.environ: - _local_gguf = Path(__file__).parent / "gguf-py" - if _local_gguf.exists(): - sys.path.insert(1, str(_local_gguf)) -import gguf - -logger = logging.getLogger("convert-bitnet-embedding-270m") - -# --------------------------------------------------------------------------- -# Tensor name mapping: HuggingFace -> GGUF -# --------------------------------------------------------------------------- - -def build_tensor_name_map(n_layers: int) -> dict[str, str]: - """Build HF tensor name -> GGUF tensor name mapping.""" - mapping: dict[str, str] = { - "embed_tokens.weight": "token_embd.weight", - "norm.weight": "output_norm.weight", - } - - for i in range(n_layers): - pfx = f"layers.{i}" - blk = f"blk.{i}" - - mapping.update({ - # Layer norms - f"{pfx}.input_layernorm.weight": 
f"{blk}.attn_norm.weight", - f"{pfx}.post_attention_layernorm.weight": f"{blk}.post_attention_norm.weight", - f"{pfx}.pre_feedforward_layernorm.weight": f"{blk}.ffn_norm.weight", - f"{pfx}.post_feedforward_layernorm.weight": f"{blk}.post_ffw_norm.weight", - - # Self-attention projections - f"{pfx}.self_attn.q_proj.weight": f"{blk}.attn_q.weight", - f"{pfx}.self_attn.k_proj.weight": f"{blk}.attn_k.weight", - f"{pfx}.self_attn.v_proj.weight": f"{blk}.attn_v.weight", - f"{pfx}.self_attn.o_proj.weight": f"{blk}.attn_output.weight", - - # QK head norms (Gemma3) - f"{pfx}.self_attn.q_norm.weight": f"{blk}.attn_q_norm.weight", - f"{pfx}.self_attn.k_norm.weight": f"{blk}.attn_k_norm.weight", - - # Per-projection input norms (BitNet-specific) - f"{pfx}.self_attn.q_proj.norm.weight": f"{blk}.attn_q_norm_in.weight", - f"{pfx}.self_attn.k_proj.norm.weight": f"{blk}.attn_k_norm_in.weight", - f"{pfx}.self_attn.v_proj.norm.weight": f"{blk}.attn_v_norm_in.weight", - f"{pfx}.self_attn.o_proj.norm.weight": f"{blk}.attn_output_norm_in.weight", - - # MLP projections - f"{pfx}.mlp.gate_proj.weight": f"{blk}.ffn_gate.weight", - f"{pfx}.mlp.up_proj.weight": f"{blk}.ffn_up.weight", - f"{pfx}.mlp.down_proj.weight": f"{blk}.ffn_down.weight", - - # Per-projection input norms for MLP (BitNet-specific) - f"{pfx}.mlp.gate_proj.norm.weight": f"{blk}.ffn_gate_norm_in.weight", - f"{pfx}.mlp.up_proj.norm.weight": f"{blk}.ffn_up_norm_in.weight", - f"{pfx}.mlp.down_proj.norm.weight": f"{blk}.ffn_down_norm_in.weight", - }) - - return mapping - - -# --------------------------------------------------------------------------- -# Tokenizer handling (BPE for Gemma3) -# --------------------------------------------------------------------------- - -def get_vocab_base_pre(tokenizer) -> str: - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 
កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' - - chktok = tokenizer.encode(chktxt) - chkhsh = sha256(str(chktok).encode()).hexdigest() - - logger.debug(f"chktok: {chktok}") - logger.debug(f"chkhsh: {chkhsh}") - - res = None - - if chkhsh == "fcb6bf9f20f6c40fa4aa4f7f99607bd6c106ca2348efdacacdca8152e59dcfe9": - # ref: multilingual-e5-270m-260311 (Gemma3 tokenizer) - res = "default" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623571571fc8d3d2d6571f5571cbe6": - # ref: google/gemma-2-9b - res = "default" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - return res - - -def _does_token_look_special(token: str) -> bool: - if not token: - return False - if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")): - return True - return False - - -def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict): - """Set BPE vocab for Gemma3.""" - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - - tokpre = get_vocab_base_pre(tokenizer) - - tokens: list[str] = [] - toktypes: list[int] = [] - - reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - added_tokens_decoder = tokenizer.added_tokens_decoder - - 
for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - token = reverse_vocab[i] - - if not added_tokens_decoder[i].normalized: - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - - if added_tokens_decoder[i].special or _does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") - toktypes.append(gguf.TokenType.USER_DEFINED) - - tokens.append(token) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - gguf_writer.add_tokenizer_model("gpt2") - gguf_writer.add_tokenizer_pre(tokpre) - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - special_vocab.add_to_gguf(gguf_writer) - - -# --------------------------------------------------------------------------- -# GGUF metadata -# --------------------------------------------------------------------------- - -def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int): - gguf_writer.add_name(dir_model.name) - - n_layers = hparams["num_hidden_layers"] - n_embd = hparams["hidden_size"] - n_head = hparams["num_attention_heads"] - n_head_kv = hparams.get("num_key_value_heads", n_head) - n_ff = hparams["intermediate_size"] - - gguf_writer.add_block_count(n_layers) - gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768)) - gguf_writer.add_embedding_length(n_embd) - gguf_writer.add_feed_forward_length(n_ff) - gguf_writer.add_head_count(n_head) - gguf_writer.add_head_count_kv(n_head_kv) - gguf_writer.add_vocab_size(hparams["vocab_size"]) - - head_dim = hparams.get("head_dim", n_embd // n_head) - gguf_writer.add_rope_dimension_count(head_dim) - gguf_writer.add_key_length(head_dim) - gguf_writer.add_value_length(head_dim) - - if 
hparams.get("rope_theta") is not None: - gguf_writer.add_rope_freq_base(hparams["rope_theta"]) - if hparams.get("rms_norm_eps") is not None: - gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - - gguf_writer.add_file_type(ftype) - - # Pooling type for embedding models - pooling_type = None - module_path = dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"].endswith("Pooling"): - pooling_path = dir_model / mod["path"] / "config.json" - if pooling_path.is_file(): - with open(pooling_path, encoding="utf-8") as f: - pooling = json.load(f) - if pooling.get("pooling_mode_mean_tokens"): - pooling_type = gguf.PoolingType.MEAN - elif pooling.get("pooling_mode_cls_token"): - pooling_type = gguf.PoolingType.CLS - elif pooling.get("pooling_mode_lasttoken"): - pooling_type = gguf.PoolingType.LAST - break - if pooling_type is None: - logger.info(" No pooling config found, defaulting to MEAN pooling") - pooling_type = gguf.PoolingType.MEAN - gguf_writer.add_pooling_type(pooling_type) - - logger.info(f" n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}, head_dim={head_dim}") - - -# --------------------------------------------------------------------------- -# Tensor iteration from safetensors -# --------------------------------------------------------------------------- - -def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]: - """Yield (name, tensor) from safetensors files.""" - from safetensors import safe_open - - safetensor_files = sorted(dir_model.glob("*.safetensors")) - if not safetensor_files: - raise FileNotFoundError(f"No .safetensors files in {dir_model}") - - for sf_path in safetensor_files: - logger.info(f"Loading {sf_path.name}") - with safe_open(str(sf_path), framework="pt", device="cpu") as f: - for name in f.keys(): - yield name, f.get_tensor(name) - - -# 
--------------------------------------------------------------------------- -# I2_S ternary packing (platform-independent) -# --------------------------------------------------------------------------- - -def quantize_to_i2_s(w: np.ndarray) -> np.ndarray: - """Quantize float weights to ternary and pack into I2_S layout. - - Uses the same quantization as BitLinear weight_quant_minmax(): - scale = 1.0 / mean(|w|) - q = round(w * scale).clamp(-1, 1) - dequant = q / scale = q * mean(|w|) - - Args: - w: float weight tensor of shape (M, K) - - Returns: - packed_data: uint8 array containing I2_S packed bytes + scale (as 4 trailing bytes) - """ - M, K = w.shape - n = M * K - w_flat = w.flatten().astype(np.float32) - - abs_mean = np.mean(np.abs(w_flat)) - abs_mean = max(abs_mean, 1e-5) - inv_scale = 1.0 / abs_mean - q_float = np.round(w_flat * inv_scale).clip(-1, 1) - - scale = np.float32(abs_mean) - - # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2} - q = np.ones(n, dtype=np.uint8) - q[q_float > 0.5] = 2 - q[q_float < -0.5] = 0 - - # Pack into I2_S layout: 128-value blocks, interleaved into 32 bytes - pad_len = (128 - n % 128) % 128 - if pad_len: - q = np.pad(q, (0, pad_len), constant_values=1) - - n_padded = len(q) - n_blocks = n_padded // 128 - - q = q.reshape(n_blocks, 4, 32) - - packed = (q[:, 0, :].astype(np.uint8) << 6) | \ - (q[:, 1, :].astype(np.uint8) << 4) | \ - (q[:, 2, :].astype(np.uint8) << 2) | \ - (q[:, 3, :].astype(np.uint8)) - - packed = packed.reshape(-1).astype(np.uint8) - - packed_size = n // 4 - total_size = packed_size + 32 - result = np.zeros(total_size, dtype=np.uint8) - result[:len(packed)] = packed[:packed_size] - result[packed_size:packed_size+4] = np.frombuffer(scale.tobytes(), dtype=np.uint8) - - return result - - -# --------------------------------------------------------------------------- -# Main conversion -# --------------------------------------------------------------------------- - -def main(): - parser = 
argparse.ArgumentParser(description="Convert bitnet-embeddings-270m (Gemma3) to GGUF") - parser.add_argument("model", type=Path, help="Model directory") - parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file") - parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16", - help="Output type: f32, f16, or i2_s (ternary quantized)") - parser.add_argument("--verbose", action="store_true") - args = parser.parse_args() - - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - - dir_model = args.model - if not dir_model.is_dir(): - logger.error(f"{dir_model} is not a directory") - sys.exit(1) - - # Default output filename - if args.outfile is None: - suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype] - args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf" - - # Load config - with open(dir_model / "config.json") as f: - hparams = json.load(f) - - arch = hparams.get("model_type", "gemma3_text") - assert arch == "gemma3_text", f"Expected gemma3_text architecture, got {arch}" - - n_layers = hparams["num_hidden_layers"] - - # Determine ftype - if args.outtype == "f32": - ftype = 0 # GGML F32 - elif args.outtype == "f16": - ftype = 1 # GGML F16 - else: # i2_s - ftype = 40 # LLAMA_FTYPE_MOSTLY_I2_S - - logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})") - - # Create GGUF writer - gguf_writer = gguf.GGUFWriter(str(args.outfile), "gemma3") - - # Set parameters - set_gguf_parameters(gguf_writer, hparams, dir_model, ftype) - - # Set vocab - logger.info("Setting tokenizer/vocab...") - set_vocab(gguf_writer, dir_model, hparams) - - # Build tensor name map - tensor_map = build_tensor_name_map(n_layers) - - # Process tensors - logger.info("Processing tensors...") - tensor_count = 0 - for hf_name, data_torch in iter_tensors(dir_model): - # Skip tensors we don't need - if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - 
continue - - # Strip "model." prefix if present - name = hf_name - if name.startswith("model."): - name = name[len("model."):] - - # Look up GGUF name - gguf_name = tensor_map.get(name) - if gguf_name is None: - logger.warning(f"Skipping unmapped tensor: {hf_name}") - continue - - old_dtype = data_torch.dtype - - # Convert bf16 -> f32 first (bf16 not directly supported by gguf) - if data_torch.dtype == torch.bfloat16: - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - n_dims = len(data.shape) - data_shape = data.shape - - # Determine if this is a linear weight suitable for ternary quantization - is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight") - is_embed = gguf_name == "token_embd.weight" - is_linear_weight = n_dims == 2 and not is_norm and not is_embed - suit_i2 = is_linear_weight - - if args.outtype == "i2_s" and suit_i2: - # --- I2_S ternary packing (scale embedded in data) --- - packed = quantize_to_i2_s(data) - data_qtype = gguf.GGMLQuantizationType.I2_S - - shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}" - logger.info(f" {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}") - - gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype) - tensor_count += 1 - - elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed): - # 2D weight tensors (linear + embedding) -> f16 - data = data.astype(np.float16) - logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") - gguf_writer.add_tensor(gguf_name, data) - tensor_count += 1 - - else: - # norms, 1D tensors - if args.outtype in ("f16", "i2_s"): - data = data.astype(np.float16) - logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") - else: - if data.dtype != np.float32: - data = data.astype(np.float32) - logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32") - gguf_writer.add_tensor(gguf_name, data) 
- tensor_count += 1 - - logger.info(f"Total tensors written: {tensor_count}") - - # Write GGUF - logger.info(f"Writing to {args.outfile}...") - gguf_writer.write_header_to_file() - gguf_writer.write_kv_data_to_file() - gguf_writer.write_tensors_to_file() - gguf_writer.close() - - logger.info("Done!") - - -if __name__ == "__main__": - main() diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py index 3a4340734..9c62ac542 100644 --- a/utils/convert-bitnet-embedding-to-gguf.py +++ b/utils/convert-bitnet-embedding-to-gguf.py @@ -23,11 +23,17 @@ logger = logging.getLogger("convert-bitnet-embedding") +# Supported architectures: model_type -> gguf arch name +SUPPORTED_ARCHS = { + "qwen3": "qwen3", + "gemma3_text": "gemma3", +} + # --------------------------------------------------------------------------- # Tensor name mapping: HuggingFace -> GGUF # --------------------------------------------------------------------------- -def build_tensor_name_map(n_layers: int) -> dict[str, str]: +def build_tensor_name_map(n_layers: int, arch: str) -> dict[str, str]: """Build HF tensor name -> GGUF tensor name mapping.""" mapping: dict[str, str] = { "embed_tokens.weight": "token_embd.weight", @@ -41,7 +47,6 @@ def build_tensor_name_map(n_layers: int) -> dict[str, str]: mapping.update({ # Layer norms f"{pfx}.input_layernorm.weight": f"{blk}.attn_norm.weight", - f"{pfx}.post_attention_layernorm.weight": f"{blk}.ffn_norm.weight", # Self-attention projections f"{pfx}.self_attn.q_proj.weight": f"{blk}.attn_q.weight", @@ -49,7 +54,7 @@ def build_tensor_name_map(n_layers: int) -> dict[str, str]: f"{pfx}.self_attn.v_proj.weight": f"{blk}.attn_v.weight", f"{pfx}.self_attn.o_proj.weight": f"{blk}.attn_output.weight", - # QK head norms (standard Qwen3) + # QK head norms f"{pfx}.self_attn.q_norm.weight": f"{blk}.attn_q_norm.weight", f"{pfx}.self_attn.k_norm.weight": f"{blk}.attn_k_norm.weight", @@ -70,20 +75,29 @@ def build_tensor_name_map(n_layers: 
int) -> dict[str, str]: f"{pfx}.mlp.down_proj.norm.weight": f"{blk}.ffn_down_norm_in.weight", }) + if arch == "qwen3": + mapping[f"{pfx}.post_attention_layernorm.weight"] = f"{blk}.ffn_norm.weight" + elif arch == "gemma3_text": + mapping.update({ + f"{pfx}.post_attention_layernorm.weight": f"{blk}.post_attention_norm.weight", + f"{pfx}.pre_feedforward_layernorm.weight": f"{blk}.ffn_norm.weight", + f"{pfx}.post_feedforward_layernorm.weight": f"{blk}.post_ffw_norm.weight", + }) + return mapping # --------------------------------------------------------------------------- -# Tokenizer handling (GPT-2 / BPE for Qwen3) +# Tokenizer handling # --------------------------------------------------------------------------- -def get_vocab_base_pre(tokenizer) -> str: +def get_vocab_base_pre(tokenizer, arch: str) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? 
\'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -93,27 +107,35 @@ def get_vocab_base_pre(tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! - if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": - # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B - res = "qwen2" + if arch == "qwen3": + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! 
+ if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": + # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B + res = "qwen2" + elif arch == "gemma3_text": + if chkhsh == "fcb6bf9f20f6c40fa4aa4f7f99607bd6c106ca2348efdacacdca8152e59dcfe9": + # ref: multilingual-e5-270m-260311 (Gemma3 tokenizer) + res = "default" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623571571fc8d3d2d6571f5571cbe6": + # ref: google/gemma-2-9b + res = "default" if res is None: logger.warning("\n") @@ -146,13 +168,13 @@ def _does_token_look_special(token: str) -> bool: return False -def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict): - """Set GPT-2 BPE vocab for Qwen3.""" +def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict, arch: str): + """Set BPE vocab.""" from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - tokpre = get_vocab_base_pre(tokenizer) + tokpre = get_vocab_base_pre(tokenizer, arch) tokens: list[str] = [] toktypes: list[int] = [] @@ -191,14 +213,18 @@ def set_vocab(gguf_writer: gguf.GGUFWriter, 
dir_model: Path, hparams: dict): gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the - # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work - # correctly, llama.cpp must append the same token. - special_vocab.special_token_ids["eos"] = 151643 + + if arch == "qwen3": + # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the + # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work + # correctly, llama.cpp must append the same token. + special_vocab.special_token_ids["eos"] = 151643 + special_vocab.add_to_gguf(gguf_writer) - # Embedding models need EOS token appended for last-token pooling - gguf_writer.add_add_eos_token(True) + if arch == "qwen3": + # Embedding models need EOS token appended for last-token pooling + gguf_writer.add_add_eos_token(True) # --------------------------------------------------------------------------- @@ -260,7 +286,7 @@ def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: pooling_type = gguf.PoolingType.MEAN gguf_writer.add_pooling_type(pooling_type) - logger.info(f" n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}") + logger.info(f" n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}, head_dim={head_dim}") # --------------------------------------------------------------------------- @@ -366,7 +392,7 @@ def quantize_to_i2_s(w: np.ndarray) -> np.ndarray: # --------------------------------------------------------------------------- def main(): - parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF") + parser = argparse.ArgumentParser(description="Convert bitnet-embeddings (Qwen3/Gemma3) to GGUF") parser.add_argument("model", type=Path, help="Model directory") parser.add_argument("--outfile", type=Path, 
default=None, help="Output GGUF file") parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16", @@ -390,9 +416,12 @@ def main(): with open(dir_model / "config.json") as f: hparams = json.load(f) - arch = hparams.get("model_type", "qwen3") - assert arch == "qwen3", f"Expected qwen3 architecture, got {arch}" + arch = hparams.get("model_type", "") + if arch not in SUPPORTED_ARCHS: + logger.error(f"Unsupported model_type '{arch}'. Supported: {list(SUPPORTED_ARCHS.keys())}") + sys.exit(1) + gguf_arch = SUPPORTED_ARCHS[arch] n_layers = hparams["num_hidden_layers"] # Determine ftype @@ -403,20 +432,20 @@ def main(): else: # i2_s ftype = 40 # LLAMA_FTYPE_MOSTLY_I2_S - logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})") + logger.info(f"Converting {dir_model.name} (arch={arch}) to GGUF ({args.outtype})") # Create GGUF writer - gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3") + gguf_writer = gguf.GGUFWriter(str(args.outfile), gguf_arch) # Set parameters set_gguf_parameters(gguf_writer, hparams, dir_model, ftype) # Set vocab logger.info("Setting tokenizer/vocab...") - set_vocab(gguf_writer, dir_model, hparams) + set_vocab(gguf_writer, dir_model, hparams, arch) # Build tensor name map - tensor_map = build_tensor_name_map(n_layers) + tensor_map = build_tensor_name_map(n_layers, arch) # Process tensors logger.info("Processing tensors...")