llama : add Mixtral support #4406

Merged on Dec 13, 2023 (47 commits). The diff below shows the changes from all commits.

Commits
dff8cbe  convert : support Mixtral as LLAMA arch (ggerganov, Dec 9, 2023)
d38e41e  convert : fix n_ff typo (ggerganov, Dec 9, 2023)
a3eefe9  llama : model loading (ggerganov, Dec 9, 2023)
861cd67  ggml : sync latest ggml_mul_mat_id (ggerganov, Dec 9, 2023)
aedfad1  llama : update graph to support MoE (ggerganov, Dec 9, 2023)
af1a096  llama : fix cur -> cur_expert (ggerganov, Dec 9, 2023)
7ea3695  llama : first working version (ggerganov, Dec 9, 2023)
8b185b7  llama : fix expert weighting in the FFN (ggerganov, Dec 9, 2023)
7372b62  ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu o… (ggerganov, Dec 9, 2023)
ee8fb39  ggml : add n_as argument to ggml_mul_mat_id (slaren, Dec 9, 2023)
9064b1c  ggml : fix ggml_get_rows to take into account ne02 / ne11 (ggerganov, Dec 9, 2023)
2cbcba8  metal : add more general support for ggml_get_rows + tests (ggerganov, Dec 9, 2023)
06dfde3  llama : add basic support for offloading moe with CUDA (slaren, Dec 9, 2023)
7e2006b  metal : add/mul/div use general kernel when src1 not cont (ggerganov, Dec 9, 2023)
8c5b66e  metal : reduce the kernel launches for ggml_mul_mat_id (ggerganov, Dec 9, 2023)
ac3f7d8  ggml : get_rows : support non-contiguos tensors with gaps, generalize… (slaren, Dec 9, 2023)
2e4db48  ggml : update get_rows f16 and q (slaren, Dec 9, 2023)
62b95f9  cuda : support non-contiguous src1 in get_rows (slaren, Dec 9, 2023)
0710b0f  llama : offload missing ffn_moe_silu (slaren, Dec 9, 2023)
016f9bb  metal : fix ggml_get_rows to work with non-cont src1 (ggerganov, Dec 10, 2023)
6cfb31f  metal : add indirect mat-vec kernels for all quantization types (ggerganov, Dec 10, 2023)
d1259b7  llama : do not quantize expert gating tensors (ggerganov, Dec 10, 2023)
e640cbe  llama : add n_expert and n_expert_used to hparams + change quants (ggerganov, Dec 10, 2023)
cefebb3  test-backend-ops : add moe test (slaren, Dec 10, 2023)
8614aa7  cuda : fix get_rows when ncols is odd (slaren, Dec 10, 2023)
65923a8  convert : determine n_ctx correctly (ggerganov, Dec 10, 2023)
b0b83dd  metal : fix ggml_mul_mat_id for F32 (ggerganov, Dec 10, 2023)
54ba263  test-backend-ops : make experts more evenly probable (test_moe) (ggerganov, Dec 10, 2023)
54d254b  test-backend-ops : cleanup, add moe test for batches (slaren, Dec 10, 2023)
f1380d7  test-backend-ops : add cpy from f32 -> all types test (slaren, Dec 10, 2023)
b002981  test-backend-ops : fix dequantize block offset (slaren, Dec 11, 2023)
8cbaed1  llama : fix hard-coded number of experts (ggerganov, Dec 11, 2023)
ffda94c  test-backend-ops : simplify and disable slow tests to avoid CI timeout (slaren, Dec 11, 2023)
33e50f1  test-backend-ops : disable MOE test with thread sanitizer (slaren, Dec 11, 2023)
296c945  cuda : fix mul_mat_id with multi gpu (slaren, Dec 11, 2023)
7dc75e3  convert : use 1e6 rope_freq_base for mixtral (slaren, Dec 11, 2023)
f1cbfab  convert : fix style (slaren, Dec 11, 2023)
6a419f4  convert : support safetensors format (ggerganov, Dec 12, 2023)
a742d9f  gguf-py : bump version (slaren, Dec 12, 2023)
08eb991  metal : add cpy f16 -> f32 kernel (ggerganov, Dec 12, 2023)
a51bc0c  metal : fix binary ops for ne10 % 4 != 0 (ggerganov, Dec 12, 2023)
ea4402b  test-backend-ops : add one more sum_rows test (ggerganov, Dec 12, 2023)
90c12e6  ggml : do not use BLAS with ggml_mul_mat_id (ggerganov, Dec 12, 2023)
82e4f64  convert-hf : support for mixtral-instruct (#4428) (Mrkvak, Dec 12, 2023)
ab558ac  metal : fix soft_max kernels (ggerganov, Dec 13, 2023)
109e7aa  metal : limit kernels to not use more than the allowed threads (ggerganov, Dec 13, 2023)
e1241d9  metal : switch to execution barriers + fix one of the barriers (ggerganov, Dec 13, 2023)
5 changes: 5 additions & 0 deletions Makefile
@@ -396,6 +396,11 @@ ifdef LLAMA_CUBLAS
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math

ifdef LLAMA_DEBUG
NVCCFLAGS += -lineinfo
endif

ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
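Usage note: with the hunk above, a CUDA build that also sets the debug flag, e.g. make LLAMA_CUBLAS=1 LLAMA_DEBUG=1, should additionally pass -lineinfo to nvcc so device code can be correlated with source lines in CUDA profilers and debuggers.
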
21 changes: 20 additions & 1 deletion convert-hf-to-gguf.py
@@ -77,8 +77,18 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_embedding_length(n_embd)
         if (n_ff := self.hparams.get("intermediate_size")) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
             self.gguf_writer.add_head_count(n_head)
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+
+        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
         self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
 
     def write_tensors(self):
@@ -170,6 +180,8 @@ def from_model_architecture(model_architecture):
return StableLMModel
if model_architecture == "QWenLMHeadModel":
return QwenModel
if model_architecture == "MixtralForCausalLM":
return MixtralModel
return Model

def _is_model_safetensors(self) -> bool:
@@ -207,6 +219,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA

raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -837,6 +851,11 @@ def set_gguf_parameters(self):
self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()


class QwenModel(Model):
@staticmethod
def token_bytes_to_string(b):
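The MixtralForCausalLM path above reuses the LLAMA architecture and the sentencepiece vocab; the new set_gguf_parameters fields copy HF config keys into GGUF metadata. A rough sketch of that mapping on a hypothetical Mixtral-style config dict (the values are illustrative, not read from any real checkpoint):

    # Hypothetical excerpt of a Mixtral-style config.json (illustrative values only)
    hf_config = {
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "rms_norm_eps": 1e-05,
        "num_local_experts": 8,      # written with gguf_writer.add_expert_count(...)
        "num_experts_per_tok": 2,    # written with gguf_writer.add_expert_used_count(...)
    }
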
74 changes: 57 additions & 17 deletions convert.py
@@ -42,6 +42,7 @@
ARCH = gguf.MODEL_ARCH.LLAMA

DEFAULT_CONCURRENCY = 8

#
# data types
#
@@ -62,10 +63,10 @@ class UnquantizedDataType(DataType):
pass


DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])


@dataclass(frozen=True)
@@ -151,14 +152,16 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
 
 @dataclass
 class Params:
-    n_vocab: int
-    n_embd: int
-    n_layer: int
-    n_ctx: int
-    n_ff: int
-    n_head: int
-    n_head_kv: int
-    f_norm_eps: float
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None
 
     rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
@@ -233,6 +236,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")

n_experts = None
n_experts_used = None

if "num_local_experts" in config:
n_experts = config["num_local_experts"]
n_experts_used = config["num_experts_per_tok"]

return Params(
n_vocab = config["vocab_size"],
n_embd = config["hidden_size"],
@@ -241,6 +251,8 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
@@ -255,8 +267,15 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
+        n_experts = None
+        n_experts_used = None
+        f_rope_freq_base = None
+
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if config.get("moe"):
+            # Mixtral
+            n_ctx = 32768
+        elif config.get("rope_theta") == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -266,16 +285,27 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
             # LLaMA v1
             n_ctx = 2048
 
+        if "layers.0.feed_forward.w1.weight" in model:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
+        if config.get("moe"):
+            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
+            n_experts = config["moe"]["num_experts"]
+            n_experts_used = config["moe"]["num_experts_per_tok"]
+            f_rope_freq_base = 1e6
+
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd = config["dim"],
             n_layer = config["n_layers"],
             n_ctx = n_ctx,
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_ff = n_ff,
             n_head = (n_head := config["n_heads"]),
             n_head_kv = config.get("n_kv_heads", n_head),
+            n_experts = n_experts,
+            n_experts_used = n_experts_used,
             f_norm_eps = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
         )
 
     @staticmethod
@@ -832,7 +862,17 @@ def add_meta_arch(self, params: Params) -> None:
         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
         self.gguf.add_head_count (params.n_head)
         self.gguf.add_head_count_kv (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
+
+        if params.n_experts:
+            self.gguf.add_expert_count(params.n_experts)
+
+        if params.n_experts_used:
+            self.gguf.add_expert_used_count(params.n_experts_used)
+
+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')
 
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@@ -956,7 +996,7 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
 
 
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
 
     if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
         return GGMLFileType.AllF32
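
For the original (non-HF) checkpoint layout handled by loadOriginalParamsJson above, the expert counts come from a "moe" block in params.json and n_ff is taken from the first expert's w1 tensor. A hedged example of the shape being parsed (the concrete numbers are those commonly reported for Mixtral 8x7B and are illustrative here, not taken from this PR):

    # Illustrative params.json for an original-format Mixtral checkpoint (assumed values)
    params_json = {
        "dim": 4096,
        "n_layers": 32,
        "n_heads": 32,
        "n_kv_heads": 8,
        "norm_eps": 1e-05,
        "moe": {"num_experts": 8, "num_experts_per_tok": 2},
    }
    # With "moe" present, the converter sets n_ctx = 32768, reads n_experts = 8 and
    # n_experts_used = 2 from the "moe" block, takes n_ff from
    # layers.0.feed_forward.experts.0.w1.weight, and defaults f_rope_freq_base to 1e6
    # unless "rope_theta" is given explicitly.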