Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Importance matrix support for legacy quants #4969

Merged
merged 2 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 192 additions & 0 deletions ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
quantize_row_q4_0_reference(x, y, k);
}


void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
const int qk = QK4_1;

Expand Down Expand Up @@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
return nrow * row_size;
}

static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
static_assert(QK4_0 == 32, "QK4_0 must be 32");

if (!quant_weights) {
quantize_row_q4_0_reference(x, y, n_per_row);
return;
}

float weight[QK4_0];
int8_t L[QK4_0];

float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
float sigma2 = sum_x2/n_per_row;

const int nb = n_per_row/QK4_0;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK4_0 * ib;
const float * qw = quant_weights + QK4_0 * ib;
for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
y[ib].d = GGML_FP32_TO_FP16(d);
for (int j = 0; j < 16; ++j) {
y[ib].qs[j] = L[j] | (L[j+16] << 4);
}
}
}

size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
if (!quant_weights) {
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
}
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}

static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
static_assert(QK4_1 == 32, "QK4_1 must be 32");

if (!quant_weights) {
quantize_row_q4_1_reference(x, y, n_per_row);
return;
}

float weight[QK4_1];
uint8_t L[QK4_1], Laux[QK4_1];

float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
float sigma2 = sum_x2/n_per_row;

const int nb = n_per_row/QK4_1;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK4_1 * ib;
const float * qw = quant_weights + QK4_1 * ib;
for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
float min;
float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
y[ib].d = GGML_FP32_TO_FP16(d);
y[ib].m = GGML_FP32_TO_FP16(-min);
for (int j = 0; j < 16; ++j) {
y[ib].qs[j] = L[j] | (L[j+16] << 4);
}
}
}

size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
if (!quant_weights) {
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
}
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}

static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
static_assert(QK5_0 == 32, "QK5_0 must be 32");

if (!quant_weights) {
quantize_row_q5_0_reference(x, y, n_per_row);
return;
}

float weight[QK5_0];
int8_t L[QK5_0];

float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
float sigma2 = sum_x2/n_per_row;

const int nb = n_per_row/QK5_0;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK5_0 * ib;
const float * qw = quant_weights + QK5_0 * ib;
for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
y[ib].d = GGML_FP32_TO_FP16(d);

uint32_t qh = 0;

for (int j = 0; j < 16; ++j) {
const uint8_t xi0 = L[j];
const uint8_t xi1 = L[j+16];
y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);

// get the 5-th bit and store it in qh at the right position
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
}

memcpy(&y[ib].qh, &qh, sizeof(qh));
}
}

size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
if (!quant_weights) {
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
}
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}

static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
static_assert(QK5_1 == 32, "QK5_1 must be 32");

if (!quant_weights) {
quantize_row_q5_1_reference(x, y, n_per_row);
return;
}

float weight[QK5_1];
uint8_t L[QK5_1], Laux[QK5_1];

float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
float sigma2 = sum_x2/n_per_row;

const int nb = n_per_row/QK5_1;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK5_1 * ib;
const float * qw = quant_weights + QK5_1 * ib;
for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
float min;
float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
y[ib].d = GGML_FP32_TO_FP16(d);
y[ib].m = GGML_FP32_TO_FP16(-min);

uint32_t qh = 0;
for (int j = 0; j < 16; ++j) {
const uint8_t xi0 = L[j];
const uint8_t xi1 = L[j+16];
y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
// get the 5-th bit and store it in qh at the right position
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
}
memcpy(&y[ib].qh, &qh, sizeof(qh));
}
}

size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
if (!quant_weights) {
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
}
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}

// ====================== "True" 2-bit (de)-quantization

static const uint64_t iq2xxs_grid[256] = {
Expand Down
4 changes: 4 additions & 0 deletions ggml-quants.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,7 @@ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row,
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
28 changes: 20 additions & 8 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_Q4_0:
{
GGML_ASSERT(start % QK4_0 == 0);
block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
result = ggml_quantize_q4_0(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q4_1:
{
GGML_ASSERT(start % QK4_1 == 0);
block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
result = ggml_quantize_q4_1(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q5_0:
{
GGML_ASSERT(start % QK5_0 == 0);
block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
result = ggml_quantize_q5_0(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q5_1:
{
GGML_ASSERT(start % QK5_1 == 0);
block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
result = ggml_quantize_q5_1(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q8_0:
{
Expand Down
10 changes: 10 additions & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8374,6 +8374,8 @@ struct quantize_state_internal {
int n_k_quantized = 0;
int n_fallback = 0;

bool has_imatrix = false;

quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
Expand Down Expand Up @@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
&& qs.has_imatrix && i_layer < n_layer/8) {
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
Expand Down Expand Up @@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
qs.has_imatrix = true;
}
}

Expand Down
Loading