llama : first working version
ggerganov committed Dec 9, 2023
1 parent af1a096 commit 7ea3695
Showing 3 changed files with 14 additions and 5 deletions.
ggml.c (9 changes: 7 additions & 2 deletions)
@@ -4105,7 +4105,9 @@ struct ggml_tensor * ggml_mul_mat_id(
     result->src[0] = ids;
     result->src[1] = b;
 
-    for (int64_t i = 0; i < n_as; i++) {
+    // TODO: n_as is the selected experts, but it should be the total number of experts
+    //for (int64_t i = 0; i < n_as; i++) {
+    for (int64_t i = 0; i < 8; i++) {
         struct ggml_tensor * a = as[i];
         GGML_ASSERT(ggml_are_same_shape(as[0], a));
         GGML_ASSERT(ggml_can_mul_mat(a, b));
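Note: at graph-build time the runtime expert selection is not yet known, so every expert matrix has to be attached to the node, which is why the loop bound becomes the total expert count (hardcoded to 8 for now). A hedged caller-side sketch, assuming the signature the op was introduced with, ggml_mul_mat_id(ctx, as[], ids, id, b); the expert-array and function names here are hypothetical, not from this diff:

    // sketch only: pass all 8 expert matrices; only the top-k are used per token
    static struct ggml_tensor * build_expert_matmul(
            struct ggml_context * ctx,
            struct ggml_tensor  * experts[8],       // hypothetical per-layer expert weights
            struct ggml_tensor  * selected_experts, // indices produced by ggml_top_k
            int                   id,               // which top-k slot (0 or 1)
            struct ggml_tensor  * cur) {            // input activations
        return ggml_mul_mat_id(ctx, experts, selected_experts, id, cur);
    }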
@@ -9758,7 +9760,10 @@ static void ggml_compute_forward_mul_mat_id(

     for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
         const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
-        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+
+        // TODO: this assert seems wrong?
+        //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id);
+        //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
 
         const struct ggml_tensor * src0_row = dst->src[row_id + 2];
         ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
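Note: the disabled assert compared row_id against ids->ne[0], but ids->ne[0] is the number of selected experts per token (2 in this commit), while row_id is an index into the full expert set (0..7) used to pick dst->src[row_id + 2], so the old bound trips on valid indices. A hedged sketch of the check the assert presumably wants, with the expert count written out since this commit hardcodes it:

    // sketch only: validate row_id against the total expert count, not ids->ne[0]
    const int n_as = 8; // total number of experts, hardcoded elsewhere in this commit
    GGML_ASSERT(row_id >= 0 && row_id < n_as);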
ggml.h (2 changes: 1 addition & 1 deletion)
@@ -217,7 +217,7 @@
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 1024
 #define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_SRC 6
+#define GGML_MAX_SRC 10
 #define GGML_MAX_NAME 64
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
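Note: the bump from 6 to 10 follows directly from the source-slot layout ggml_mul_mat_id uses in the ggml.c changes above; the arithmetic, spelled out:

    // src[0]     = ids               // selected expert indices
    // src[1]     = b                 // input activations
    // src[2 + i] = as[i], i = 0..7   // one extra slot per expert matrix
    // => 2 + 8 = 10 slots needed, hence GGML_MAX_SRC = 10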
llama.cpp (8 changes: 6 additions & 2 deletions)
@@ -4242,14 +4242,18 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            const int n_experts_per_tok = 2; // TODO: param
+            // TODO: param
+            const int n_experts = 8;
+            const int n_experts_per_tok = 2;
 
             ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
             ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
 
             // select experts
             ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
-            ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1]
+            //ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1]
+            ggml_tensor * weights = ggml_get_rows(ctx0,
+                    ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts);
 
             weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1]
 
             // compute expert outputs
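Note: the reshape fix above presumably makes ggml_get_rows treat probs as n_experts rows of length 1 per token, so the indices from ggml_top_k gather one scalar probability per selected expert; the old call indexed rows of length n_experts and produced the wrong shape. A standalone C sketch (not part of the commit) of the routing math this graph fragment encodes, for one token with illustrative values:

    #include <stdio.h>

    #define N_EXPERTS         8
    #define N_EXPERTS_PER_TOK 2

    int main(void) {
        // gating probabilities for one token, as produced by ggml_soft_max
        float probs[N_EXPERTS] = {0.05f, 0.30f, 0.10f, 0.02f, 0.25f, 0.08f, 0.12f, 0.08f};

        int   sel[N_EXPERTS_PER_TOK]; // chosen expert indices (ggml_top_k)
        float w[N_EXPERTS_PER_TOK];   // their probabilities (ggml_get_rows)

        // top-k selection by repeated argmax over the not-yet-chosen entries
        int used[N_EXPERTS] = {0};
        for (int k = 0; k < N_EXPERTS_PER_TOK; k++) {
            int best = -1;
            for (int e = 0; e < N_EXPERTS; e++) {
                if (!used[e] && (best < 0 || probs[e] > probs[best])) {
                    best = e;
                }
            }
            used[best] = 1;
            sel[k] = best;
            w[k]   = probs[best];
        }

        // renormalize so the selected weights sum to 1
        // (this is the ggml_div(weights, ggml_sum_rows(weights)) step)
        float sum = 0.0f;
        for (int k = 0; k < N_EXPERTS_PER_TOK; k++) sum += w[k];
        for (int k = 0; k < N_EXPERTS_PER_TOK; k++) w[k] /= sum;

        // the FFN output would then be sum_k w[k] * expert_sel[k](cur)
        for (int k = 0; k < N_EXPERTS_PER_TOK; k++) {
            printf("expert %d, weight %.3f\n", sel[k], w[k]);
        }
        return 0;
    }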
