hacked together mixtral-moe conversion script
lxe committed Dec 8, 2023
1 parent bcc0eb4 commit 2dd8944
Showing 3 changed files with 187 additions and 3 deletions.
6 changes: 3 additions & 3 deletions convert.py
@@ -39,7 +39,7 @@

NDArray: TypeAlias = 'np.ndarray[Any, Any]'

-ARCH = gguf.MODEL_ARCH.LLAMA
+ARCH = gguf.MODEL_ARCH.MOE

DEFAULT_CONCURRENCY = 8
#
@@ -241,7 +241,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head),
-f_norm_eps = config["rms_norm_eps"],
+f_norm_eps = config["norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
@@ -271,7 +271,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
n_embd = config["dim"],
n_layer = config["n_layers"],
n_ctx = n_ctx,
-n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
+n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0],
n_head = (n_head := config["n_heads"]),
n_head_kv = config.get("n_kv_heads", n_head),
f_norm_eps = config["norm_eps"],
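Note on the `n_ff` change above: it assumes a Mixtral-style checkpoint where each layer's feed-forward block stores its experts under `feed_forward.experts.<i>.{w1,w2,w3}`, so the feed-forward width is taken from the first expert's `w1`. A rough sketch of that assumption (the checkpoint filename, the `torch` dependency, and the example value are illustrative, not taken from this commit):

# Illustrative sketch only -- not part of this commit.
import torch  # hypothetical dependency; convert.py reads tensors through its own lazy loaders

state = torch.load("consolidated.00.pth", map_location="cpu")

# w1 projects n_embd -> n_ff, so the first dimension of the first expert's
# w1 weight gives the feed-forward width recorded in the GGUF metadata.
n_ff = state["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
print(n_ff)  # e.g. 14336 for Mixtral-8x7B (assumed value)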
86 changes: 86 additions & 0 deletions gguf-py/gguf/constants.py
@@ -93,6 +93,7 @@ class MODEL_ARCH(IntEnum):
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
MOE = auto()


class MODEL_TENSOR(IntEnum):
@@ -117,6 +118,30 @@ class MODEL_TENSOR(IntEnum):
FFN_NORM = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
FFN_EXPERT_0_W1 = auto()
FFN_EXPERT_0_W2 = auto()
FFN_EXPERT_0_W3 = auto()
FFN_EXPERT_1_W1 = auto()
FFN_EXPERT_1_W2 = auto()
FFN_EXPERT_1_W3 = auto()
FFN_EXPERT_2_W1 = auto()
FFN_EXPERT_2_W2 = auto()
FFN_EXPERT_2_W3 = auto()
FFN_EXPERT_3_W1 = auto()
FFN_EXPERT_3_W2 = auto()
FFN_EXPERT_3_W3 = auto()
FFN_EXPERT_4_W1 = auto()
FFN_EXPERT_4_W2 = auto()
FFN_EXPERT_4_W3 = auto()
FFN_EXPERT_5_W1 = auto()
FFN_EXPERT_5_W2 = auto()
FFN_EXPERT_5_W3 = auto()
FFN_EXPERT_6_W1 = auto()
FFN_EXPERT_6_W2 = auto()
FFN_EXPERT_6_W3 = auto()
FFN_EXPERT_7_W1 = auto()
FFN_EXPERT_7_W2 = auto()
FFN_EXPERT_7_W3 = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -134,6 +159,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.MOE: "moe",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -158,6 +184,30 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_EXPERT_0_W1: "layers.{bid}.feed_forward.experts.0.w1",
MODEL_TENSOR.FFN_EXPERT_0_W2: "layers.{bid}.feed_forward.experts.0.w2",
MODEL_TENSOR.FFN_EXPERT_0_W3: "layers.{bid}.feed_forward.experts.0.w3",
MODEL_TENSOR.FFN_EXPERT_1_W1: "layers.{bid}.feed_forward.experts.1.w1",
MODEL_TENSOR.FFN_EXPERT_1_W2: "layers.{bid}.feed_forward.experts.1.w2",
MODEL_TENSOR.FFN_EXPERT_1_W3: "layers.{bid}.feed_forward.experts.1.w3",
MODEL_TENSOR.FFN_EXPERT_2_W1: "layers.{bid}.feed_forward.experts.2.w1",
MODEL_TENSOR.FFN_EXPERT_2_W2: "layers.{bid}.feed_forward.experts.2.w2",
MODEL_TENSOR.FFN_EXPERT_2_W3: "layers.{bid}.feed_forward.experts.2.w3",
MODEL_TENSOR.FFN_EXPERT_3_W1: "layers.{bid}.feed_forward.experts.3.w1",
MODEL_TENSOR.FFN_EXPERT_3_W2: "layers.{bid}.feed_forward.experts.3.w2",
MODEL_TENSOR.FFN_EXPERT_3_W3: "layers.{bid}.feed_forward.experts.3.w3",
MODEL_TENSOR.FFN_EXPERT_4_W1: "layers.{bid}.feed_forward.experts.4.w1",
MODEL_TENSOR.FFN_EXPERT_4_W2: "layers.{bid}.feed_forward.experts.4.w2",
MODEL_TENSOR.FFN_EXPERT_4_W3: "layers.{bid}.feed_forward.experts.4.w3",
MODEL_TENSOR.FFN_EXPERT_5_W1: "layers.{bid}.feed_forward.experts.5.w1",
MODEL_TENSOR.FFN_EXPERT_5_W2: "layers.{bid}.feed_forward.experts.5.w2",
MODEL_TENSOR.FFN_EXPERT_5_W3: "layers.{bid}.feed_forward.experts.5.w3",
MODEL_TENSOR.FFN_EXPERT_6_W1: "layers.{bid}.feed_forward.experts.6.w1",
MODEL_TENSOR.FFN_EXPERT_6_W2: "layers.{bid}.feed_forward.experts.6.w2",
MODEL_TENSOR.FFN_EXPERT_6_W3: "layers.{bid}.feed_forward.experts.6.w3",
MODEL_TENSOR.FFN_EXPERT_7_W1: "layers.{bid}.feed_forward.experts.7.w1",
MODEL_TENSOR.FFN_EXPERT_7_W2: "layers.{bid}.feed_forward.experts.7.w2",
MODEL_TENSOR.FFN_EXPERT_7_W3: "layers.{bid}.feed_forward.experts.7.w3",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -333,6 +383,42 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_EXPERT_0_W1,
MODEL_TENSOR.FFN_EXPERT_0_W2,
MODEL_TENSOR.FFN_EXPERT_0_W3,
MODEL_TENSOR.FFN_EXPERT_1_W1,
MODEL_TENSOR.FFN_EXPERT_1_W2,
MODEL_TENSOR.FFN_EXPERT_1_W3,
MODEL_TENSOR.FFN_EXPERT_2_W1,
MODEL_TENSOR.FFN_EXPERT_2_W2,
MODEL_TENSOR.FFN_EXPERT_2_W3,
MODEL_TENSOR.FFN_EXPERT_3_W1,
MODEL_TENSOR.FFN_EXPERT_3_W2,
MODEL_TENSOR.FFN_EXPERT_3_W3,
MODEL_TENSOR.FFN_EXPERT_4_W1,
MODEL_TENSOR.FFN_EXPERT_4_W2,
MODEL_TENSOR.FFN_EXPERT_4_W3,
MODEL_TENSOR.FFN_EXPERT_5_W1,
MODEL_TENSOR.FFN_EXPERT_5_W2,
MODEL_TENSOR.FFN_EXPERT_5_W3,
MODEL_TENSOR.FFN_EXPERT_6_W1,
MODEL_TENSOR.FFN_EXPERT_6_W2,
MODEL_TENSOR.FFN_EXPERT_6_W3,
MODEL_TENSOR.FFN_EXPERT_7_W1,
MODEL_TENSOR.FFN_EXPERT_7_W2,
MODEL_TENSOR.FFN_EXPERT_7_W3,
],
MODEL_ARCH.GPT2: [
# TODO
],
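The expert enum members and `TENSOR_NAMES` entries above are spelled out by hand for the eight Mixtral experts. As an illustration only (not how this commit is written), the same `layers.{bid}.feed_forward.experts.<i>.w<j>` names could be produced with a small loop, which makes the naming pattern explicit:

# Illustrative sketch only -- not part of this commit.
N_EXPERTS = 8  # Mixtral routes among 8 experts per layer

expert_tensor_names = {
    f"FFN_EXPERT_{i}_W{j}": f"layers.{{bid}}.feed_forward.experts.{i}.w{j}"
    for i in range(N_EXPERTS)
    for j in (1, 2, 3)
}

# expert_tensor_names["FFN_EXPERT_3_W1"] == "layers.{bid}.feed_forward.experts.3.w1"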
98 changes: 98 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -169,6 +169,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
"layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen
"layers.{bid}.feed_forward.gate" # moe
),

# Feed-forward down
@@ -196,6 +197,102 @@
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),

MODEL_TENSOR.FFN_EXPERT_0_W1: (
"layers.{bid}.feed_forward.experts.0.w1",
),

MODEL_TENSOR.FFN_EXPERT_0_W2: (
"layers.{bid}.feed_forward.experts.0.w2",
),

MODEL_TENSOR.FFN_EXPERT_0_W3: (
"layers.{bid}.feed_forward.experts.0.w3",
),

MODEL_TENSOR.FFN_EXPERT_1_W1: (
"layers.{bid}.feed_forward.experts.1.w1",
),

MODEL_TENSOR.FFN_EXPERT_1_W2: (
"layers.{bid}.feed_forward.experts.1.w2",
),

MODEL_TENSOR.FFN_EXPERT_1_W3: (
"layers.{bid}.feed_forward.experts.1.w3",
),

MODEL_TENSOR.FFN_EXPERT_2_W1: (
"layers.{bid}.feed_forward.experts.2.w1",
),

MODEL_TENSOR.FFN_EXPERT_2_W2: (
"layers.{bid}.feed_forward.experts.2.w2",
),

MODEL_TENSOR.FFN_EXPERT_2_W3: (
"layers.{bid}.feed_forward.experts.2.w3",
),

MODEL_TENSOR.FFN_EXPERT_3_W1: (
"layers.{bid}.feed_forward.experts.3.w1",
),

MODEL_TENSOR.FFN_EXPERT_3_W2: (
"layers.{bid}.feed_forward.experts.3.w2",
),

MODEL_TENSOR.FFN_EXPERT_3_W3: (
"layers.{bid}.feed_forward.experts.3.w3",
),

MODEL_TENSOR.FFN_EXPERT_4_W1: (
"layers.{bid}.feed_forward.experts.4.w1",
),

MODEL_TENSOR.FFN_EXPERT_4_W2: (
"layers.{bid}.feed_forward.experts.4.w2",
),

MODEL_TENSOR.FFN_EXPERT_4_W3: (
"layers.{bid}.feed_forward.experts.4.w3",
),

MODEL_TENSOR.FFN_EXPERT_5_W1: (
"layers.{bid}.feed_forward.experts.5.w1",
),

MODEL_TENSOR.FFN_EXPERT_5_W2: (
"layers.{bid}.feed_forward.experts.5.w2",
),

MODEL_TENSOR.FFN_EXPERT_5_W3: (
"layers.{bid}.feed_forward.experts.5.w3",
),

MODEL_TENSOR.FFN_EXPERT_6_W1: (
"layers.{bid}.feed_forward.experts.6.w1",
),

MODEL_TENSOR.FFN_EXPERT_6_W2: (
"layers.{bid}.feed_forward.experts.6.w2",
),

MODEL_TENSOR.FFN_EXPERT_6_W3: (
"layers.{bid}.feed_forward.experts.6.w3",
),

MODEL_TENSOR.FFN_EXPERT_7_W1: (
"layers.{bid}.feed_forward.experts.7.w1",
),

MODEL_TENSOR.FFN_EXPERT_7_W2: (
"layers.{bid}.feed_forward.experts.7.w2",
),

MODEL_TENSOR.FFN_EXPERT_7_W3: (
"layers.{bid}.feed_forward.experts.7.w3",
),
}

mapping: dict[str, tuple[MODEL_TENSOR, str]]
@@ -211,6 +308,7 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
self.mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
print(tensor, keys)
if tensor not in MODEL_TENSORS[arch]:
continue
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
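Taken together, the new `MODEL_ARCH.MOE` entry and the per-expert mappings let `TensorNameMap` resolve raw Mixtral tensor names into the GGUF-side names from `TENSOR_NAMES`. A rough usage sketch, assuming the patched gguf-py from this commit is importable; the block count of 32 is the Mixtral-8x7B value (assumed here), and the `print(tensor, keys)` debug line added in `__init__` will also emit output for every mapping while this runs:

# Illustrative sketch only -- not part of this commit.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import TensorNameMap

tmap = TensorNameMap(MODEL_ARCH.MOE, 32)

# mapping maps each known checkpoint name to (MODEL_TENSOR, gguf-side base name)
tensor_type, name = tmap.mapping["layers.0.feed_forward.experts.3.w1"]
print(tensor_type, name)
# -> MODEL_TENSOR.FFN_EXPERT_3_W1 layers.0.feed_forward.experts.3.w1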
