From b9c0622927690e29864123b20e4b5eb08e8f0765 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 20:47:45 +0300
Subject: [PATCH 01/22] ggml-tensor: Add ggml_tensor_get_data_type

---
 ggml-gobject/ggml-tensor.c | 12 ++++++++++++
 ggml-gobject/ggml-tensor.h |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/ggml-gobject/ggml-tensor.c b/ggml-gobject/ggml-tensor.c
index 33fee01..2100aa9 100644
--- a/ggml-gobject/ggml-tensor.c
+++ b/ggml-gobject/ggml-tensor.c
@@ -280,6 +280,18 @@ ggml_tensor_get_name (GGMLTensor *tensor)
   return ggml_get_name (tensor->tensor);
 }
 
+/**
+ * ggml_tensor_get_data_type:
+ * @tensor: A #GGMLTensor
+ *
+ * Returns: A #GGMLDataType which is the data type of this tensor
+ */
+GGMLDataType
+ggml_tensor_get_data_type (GGMLTensor *tensor)
+{
+  return (GGMLDataType) tensor->tensor->type;
+}
+
 /**
  * ggml_tensor_get_shape:
  * @tensor: A #GGMLTensor
diff --git a/ggml-gobject/ggml-tensor.h b/ggml-gobject/ggml-tensor.h
index 5e131b7..68e159e 100644
--- a/ggml-gobject/ggml-tensor.h
+++ b/ggml-gobject/ggml-tensor.h
@@ -49,7 +49,7 @@ GBytes * ggml_tensor_get_bytes (GGMLTensor *tensor);
 void ggml_tensor_set_name (GGMLTensor *tensor,
                            const char *name);
 const char * ggml_tensor_get_name (GGMLTensor *tensor);
-
+GGMLDataType ggml_tensor_get_data_type (GGMLTensor *tensor);
 int64_t *ggml_tensor_get_shape (GGMLTensor *tensor,
                                 size_t *out_n_dims);
 GPtrArray * ggml_tensor_get_cgraph_children (GGMLTensor *tensor);

From 04ebcc25727b80a39173a959b1a952e2a7f4870e Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 20:53:28 +0300
Subject: [PATCH 02/22] ggml-context: Add ggml_context_new_tensor (generic
 version of 1d/2d/3d funcs)

---
 ggml-gobject/ggml-context.c | 21 +++++++++++++++++++++
 ggml-gobject/ggml-context.h |  4 ++++
 2 files changed, 25 insertions(+)

diff --git a/ggml-gobject/ggml-context.c b/ggml-gobject/ggml-context.c
index 615e672..0c7ac00 100644
--- a/ggml-gobject/ggml-context.c
+++ b/ggml-gobject/ggml-context.c
@@ -111,6 +111,27 @@ ggml_context_unref (GGMLContext *context)
     }
 }
 
+/**
+ * ggml_context_new_tensor:
+ * @context: A #GGMLContext
+ * @data_type: A #GGMLDataType for the new tensor
+ * @shape: (array length=n_dims): Shape of the tensor
+ * @n_dims: Number of dimensions in the tensor shape
+ *
+ * Creates a new #GGMLTensor from the memory pool of @context
+ * with shape @shape
+ *
+ * Returns: (transfer full): The #GGMLTensor
+ */
+GGMLTensor *
+ggml_context_new_tensor (GGMLContext *context,
+                         GGMLDataType data_type,
+                         int64_t *shape,
+                         size_t n_dims)
+{
+  return ggml_tensor_new (context, data_type, shape, n_dims);
+}
+
 /**
  * ggml_context_new_tensor_1d:
  * @context: A #GGMLContext
diff --git a/ggml-gobject/ggml-context.h b/ggml-gobject/ggml-context.h
index 9c43a33..fa73c56 100644
--- a/ggml-gobject/ggml-context.h
+++ b/ggml-gobject/ggml-context.h
@@ -37,6 +37,10 @@ GGMLContext *ggml_context_new_from_mem_buffer (GBytes *mem_buffer);
 GGMLContext *ggml_context_new (size_t memory_size);
 GGMLContext *ggml_context_ref (GGMLContext *context);
 void ggml_context_unref (GGMLContext *context);
+GGMLTensor *ggml_context_new_tensor (GGMLContext *context,
+                                     GGMLDataType data_type,
+                                     int64_t *shape,
+                                     size_t n_dims);
 GGMLTensor *ggml_context_new_tensor_1d (GGMLContext *context,
                                         GGMLDataType data_type,
                                         size_t size);
 GGMLTensor *ggml_context_new_tensor_2d (GGMLContext *context,

From 4227298dd7b6cccadd720a06c3e8ce3435edbc12 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:00:53 +0300
Subject: [PATCH 03/22] ggml-model-desc: Add ggml_model_desc_map

---
 ggml-gobject/ggml-model-desc.c | 73 ++++++++++++++++++++++++++++++++++
 ggml-gobject/ggml-model-desc.h | 15 +++++++
 2 files changed, 88 insertions(+)

diff --git a/ggml-gobject/ggml-model-desc.c b/ggml-gobject/ggml-model-desc.c
index dd237a2..0731162 100644
--- a/ggml-gobject/ggml-model-desc.c
+++ b/ggml-gobject/ggml-model-desc.c
@@ -181,6 +181,79 @@ ggml_model_node_flatten_recurse (GHashTable *table, GGMLModelDescNode *current_n
     }
 }
 
+GGMLModelDescNode *
+ggml_model_desc_map_recurse (GGMLModelDescNode *current_node,
+                             GGMLModelDescMapFunc map_func,
+                             gpointer map_user_data,
+                             const char *path)
+{
+  g_autoptr(GGMLModelDescLeaf) mapped_leaf = NULL;
+
+  if (current_node->leaf != NULL)
+    {
+      mapped_leaf = map_func (path,
+                              current_node->leaf,
+                              map_user_data);
+    }
+
+  g_autoptr(GHashTable) mapped_children = NULL;
+
+  if (current_node->children)
+    {
+      GHashTableIter iter;
+      gpointer key, value;
+
+      mapped_children = g_hash_table_new_full (g_str_hash,
+                                               g_str_equal,
+                                               g_free,
+                                               (GDestroyNotify) ggml_model_desc_node_unref);
+
+      g_hash_table_iter_init (&iter, current_node->children);
+      while (g_hash_table_iter_next (&iter, &key, &value))
+        {
+          const char *key_str = key;
+          GGMLModelDescNode *child = value;
+
+          g_autofree char *child_path = (path == NULL ?
+                                         g_strdup(key) :
+                                         g_strjoin("/", path, (const gchar *) key, NULL));
+
+          GGMLModelDescNode *new_child = ggml_model_desc_map_recurse (child,
+                                                                      map_func,
+                                                                      map_user_data,
+                                                                      child_path);
+
+          g_hash_table_insert (mapped_children, g_strdup (key_str), g_steal_pointer (&new_child));
+        }
+    }
+
+  return ggml_model_desc_node_new (g_steal_pointer (&mapped_leaf),
+                                   g_steal_pointer (&mapped_children));
+}
+
+/**
+ * ggml_model_desc_map:
+ * @model_desc: A #GGMLModelDescNode to apply @map_func to
+ * @map_func: (scope call): A #GGMLModelDescMapFunc
+ * @map_user_data: (closure map_func): User data for @map_func
+ *
+ * Map @model_desc and return a new #GGMLModelDescNode tree with
+ * the @map_func applied to each leaf. The @map_func must return a
+ * new #GGMLModelDescLeaf for each leaf.
+ *
+ * Returns: (transfer full): A new #GGMLModelDescNode transformed by @map_func.
+ */
+GGMLModelDescNode *
+ggml_model_desc_map (GGMLModelDescNode *model_desc,
+                     GGMLModelDescMapFunc map_func,
+                     gpointer map_user_data)
+{
+  return ggml_model_desc_map_recurse (model_desc,
+                                      map_func,
+                                      map_user_data,
+                                      NULL);
+}
+
 /**
  * ggml_model_desc_node_flatten:
  * @node: A #GGMLModelDescNode to flatten
diff --git a/ggml-gobject/ggml-model-desc.h b/ggml-gobject/ggml-model-desc.h
index 2ec6b96..3d90b81 100644
--- a/ggml-gobject/ggml-model-desc.h
+++ b/ggml-gobject/ggml-model-desc.h
@@ -74,6 +74,21 @@ GGMLModelDescNode *ggml_model_desc_node_new_leaf (int64_t *dimensions,
                                                   GGMLDataType type);
 GGMLModelDescNode *ggml_model_desc_node_ref (GGMLModelDescNode *node);
 GHashTable *ggml_model_desc_node_flatten (GGMLModelDescNode *node);
+
+/**
+ * GGMLModelDescMapFunc:
+ * @path: The key for the current weight
+ * @leaf: (transfer none): A #GGMLModelDescLeaf
+ *
+ * Returns: (transfer full): A new #GGMLModelDescLeaf to use in place of @leaf.
+ */
+typedef GGMLModelDescLeaf * (*GGMLModelDescMapFunc) (const char *path,
+                                                     const GGMLModelDescLeaf *leaf,
+                                                     gpointer user_data);
+
+GGMLModelDescNode *ggml_model_desc_map (GGMLModelDescNode *model_desc,
+                                        GGMLModelDescMapFunc map_func,
+                                        gpointer map_user_data);
 void ggml_model_desc_node_unref (GGMLModelDescNode *node);
 
 /**

From 1dfe88e4ae7911a2d0635e585ff2f895b7e1fb59 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:06:48 +0300
Subject: [PATCH 04/22] ggml-tensor: Add ggml_tensor_new

---
 ggml-gobject/ggml-tensor.c                   | 13 +++++++++++++
 ggml-gobject/internal/ggml-tensor-internal.h |  6 ++++++
 2 files changed, 19 insertions(+)

diff --git a/ggml-gobject/ggml-tensor.c b/ggml-gobject/ggml-tensor.c
index 2100aa9..099fc4c 100644
--- a/ggml-gobject/ggml-tensor.c
+++ b/ggml-gobject/ggml-tensor.c
@@ -35,6 +35,19 @@ ggml_tensor_from_tensor (GGMLContext *context, struct ggml_tensor *base_tensor)
   return tensor;
 }
 
+GGMLTensor *
+ggml_tensor_new (GGMLContext *context,
+                 GGMLDataType data_type,
+                 int64_t *shape,
+                 size_t n_dims)
+{
+  return ggml_tensor_from_tensor (context,
+                                  ggml_new_tensor (context->ctx,
+                                                   (enum ggml_type) data_type,
+                                                   n_dims,
+                                                   shape));
+}
+
 GGMLTensor *
 ggml_tensor_new_1d (GGMLContext *context, GGMLDataType data_type, size_t size)
 {
diff --git a/ggml-gobject/internal/ggml-tensor-internal.h b/ggml-gobject/internal/ggml-tensor-internal.h
index b3fdc21..16bb6f6 100644
--- a/ggml-gobject/internal/ggml-tensor-internal.h
+++ b/ggml-gobject/internal/ggml-tensor-internal.h
@@ -36,6 +36,12 @@ struct _GGMLTensor {
 GGMLTensor *
 ggml_tensor_from_tensor (GGMLContext *context,
                          struct ggml_tensor *base_tensor);
 
+GGMLTensor *
+ggml_tensor_new (GGMLContext *context,
+                 GGMLDataType data_type,
+                 int64_t *shape,
+                 size_t n_dims);
+
 GGMLTensor *
 ggml_tensor_new_1d (GGMLContext *context,
                     GGMLDataType data_type,
                     size_t size);

From d824d1127b037c4acf600c94d6c01abd3652d299 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:07:33 +0300
Subject: [PATCH 05/22] ggml-tensor: Fix doc comment

---
 ggml-gobject/ggml-tensor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-gobject/ggml-tensor.c b/ggml-gobject/ggml-tensor.c
index 099fc4c..20dd686 100644
--- a/ggml-gobject/ggml-tensor.c
+++ b/ggml-gobject/ggml-tensor.c
@@ -181,7 +181,7 @@ ggml_tensor_n_bytes (GGMLTensor *tensor)
  * @size: The number of bytes to read
  *
  * Sets the data of the tensor. It is the caller's responsibility to
- * pass a buffer of the correct size.
+ * pass a buffer of the correct size and stride.
  */
 void
 ggml_tensor_set_data (GGMLTensor *tensor, char *data, size_t size)

From e1347e8d289c884130e452a776ae1d041c36aafc Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:07:54 +0300
Subject: [PATCH 06/22] ggml-tensor: Add assert about size of buffer
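
This turns a silent out-of-bounds write into a hard failure. A minimal
sketch of the caller contract being asserted (the context and tensor
names here are illustrative, not part of this patch):

    /* Assumes a GGMLContext created earlier, e.g. with ggml_context_new () */
    g_autoptr(GGMLTensor) tensor =
        ggml_context_new_tensor_1d (context, GGML_DATA_TYPE_F32, 16);

    g_autofree float *buffer = g_new0 (float, 16);

    /* Must not exceed ggml_tensor_n_bytes (tensor), or the new
     * g_assert in ggml_tensor_set_data will abort */
    ggml_tensor_set_data (tensor, (char *) buffer, sizeof (float) * 16);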
---
 ggml-gobject/ggml-tensor.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml-gobject/ggml-tensor.c b/ggml-gobject/ggml-tensor.c
index 20dd686..eef0b8a 100644
--- a/ggml-gobject/ggml-tensor.c
+++ b/ggml-gobject/ggml-tensor.c
@@ -186,6 +186,7 @@ ggml_tensor_n_bytes (GGMLTensor *tensor)
 void
 ggml_tensor_set_data (GGMLTensor *tensor, char *data, size_t size)
 {
+  g_assert (size <= ggml_tensor_n_bytes (tensor));
   memcpy(tensor->tensor->data, (const void *) data, size);
 }
 

From c4a58540e46815384c2a7a371817ba3d29c3c8de Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:10:16 +0300
Subject: [PATCH 07/22] ggml-model: Formatting adjustment

---
 ggml-gobject/ggml-model.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-gobject/ggml-model.c b/ggml-gobject/ggml-model.c
index 9a67f60..1f5f10c 100644
--- a/ggml-gobject/ggml-model.c
+++ b/ggml-gobject/ggml-model.c
@@ -89,7 +89,8 @@ ggml_model_new_from_flattened_desc (GGMLContext *context,
   return model;
 }
 
-static inline int32_t product_i32 (int32_t *array, size_t n)
+static inline int32_t
+product_i32 (int32_t *array, size_t n)
 {
   int32_t product = 1;
   for (size_t i = 0; i < n; ++i)

From 09289f1f8e700a3eea46c22c52ece9de01672876 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:14:16 +0300
Subject: [PATCH 08/22] ggml-model: Compute context memory size from the model
 desc

---
 ggml-gobject/ggml-model.c | 83 ++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/ggml-gobject/ggml-model.c b/ggml-gobject/ggml-model.c
index 1f5f10c..65e115a 100644
--- a/ggml-gobject/ggml-model.c
+++ b/ggml-gobject/ggml-model.c
@@ -208,45 +208,55 @@ ggml_model_load_weights_from_istream (GInputStream *istream,
 }
 
 static size_t
-ggml_estimate_transformer_model_memory (int32_t n_vocab, int32_t n_embd, int32_t n_head, int32_t n_layer, int32_t n_ctx, int32_t ftype)
+ggml_estimate_tensor_size_for_type (GGMLDataType data_type,
+                                    int64_t *shape,
+                                    size_t n_shape)
 {
-  const int32_t head_dim = n_embd / n_head;
-  const int32_t kv_heads = n_head;
-  const int32_t kv_dim = kv_heads * head_dim;
-  enum ggml_type wtype = ggml_ftype_to_ggml_type((enum ggml_ftype) (ftype));
-  size_t ctx_size = 0;
-
-  ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-  ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-  ctx_size += n_vocab * n_embd * ggml_type_sizef(wtype); // wte
-  ctx_size += n_ctx * n_embd * ggml_type_sizef(GGML_TYPE_F32); // wpe
-  ctx_size += n_vocab * n_embd * ggml_type_sizef(wtype); // lm_head
-
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+  size_t nb[GGML_MAX_DIMS];
+  size_t ne[GGML_MAX_DIMS];
+  size_t blk_size = ggml_blck_size (data_type);
+
+  /* The true size is the stride at the final shape index * number
+   * of elements (e.g., how do we get from the beginning to the end
+   * of the tensor on the outer dimension) */
+  nb[0] = ggml_type_size (data_type);
+  ne[0] = shape[0];
+  nb[1] = nb[0] * (ne[0] / blk_size);
+  ne[1] = (n_shape > 1) ? shape[1] : 1;
+
+  for (size_t i = 2; i < GGML_MAX_DIMS; ++i)
+    {
+      ne[i] = (i < n_shape) ? shape[i] : 1;
+      nb[i] = ne[i - 1] * nb[i - 1];
+    }
 
-  ctx_size += n_layer * ((n_embd + 2 * kv_dim) * n_embd * 3 * ggml_type_sizef(wtype)); // c_attn_attn_w // TODO:
-  ctx_size += n_layer * ((n_embd + 2 * kv_dim) * 3 * ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+  return ne[GGML_MAX_DIMS - 1] * nb[GGML_MAX_DIMS - 1];
+}
 
-  ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // c_attn_proj_w
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+static size_t
+ggml_estimate_model_size_from_flattened_desc (GHashTable *flattened_desc)
+{
+  gpointer key, value;
+  GHashTableIter iter;
 
-  ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // c_mlp_fc_w
-  ctx_size += n_layer * (4 * n_embd * ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+  size_t computed_size = 0;
+  size_t overhead = ggml_tensor_overhead ();
 
-  ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // c_mlp_proj_w
-  ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+  g_hash_table_iter_init (&iter, flattened_desc);
+  while (g_hash_table_iter_next (&iter, &key, &value))
+    {
+      GGMLModelDescLeaf *leaf = value;
 
-  ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F32); // memory_k
-  ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F32); // memory_v
+      size_t n_dims = leaf->n_dim;
+      int64_t *shape = leaf->dimensions;
 
-  ctx_size += (6 + 12 * n_layer) * 512; // object overhead
+      /* Here we will quantize, so estimate the size of the quantized tensor */
+      computed_size += ggml_estimate_tensor_size_for_type (leaf->type,
+                                                           shape,
+                                                           n_dims) + overhead;
+    }
 
-  return ctx_size;
+  return computed_size;
 }
 
 /**
@@ -275,16 +285,9 @@ ggml_model_load_from_istream (GInputStream *istream,
                               GCancellable *cancellable,
                               GError **error)
 {
-  const int32_t n_embd = ggml_hyperparameters_get_int32 (hyperparameters, "n_embd");
-  const int32_t n_layer = ggml_hyperparameters_get_int32 (hyperparameters, "n_layer");
-  const int32_t n_ctx = ggml_hyperparameters_get_int32 (hyperparameters, "n_ctx");
-  const int32_t n_vocab = ggml_hyperparameters_get_int32 (hyperparameters, "n_vocab");
-  const int32_t n_head = ggml_hyperparameters_get_int32 (hyperparameters, "n_head");
-  const int32_t ftype = ggml_hyperparameters_get_int32 (hyperparameters, "ftype");
-
-  size_t memory_size = ggml_estimate_transformer_model_memory (n_vocab, n_embd, n_head, n_layer, n_ctx, ftype);
-  g_autoptr (GGMLContext) context = ggml_context_new (memory_size);
   g_autoptr (GHashTable) flattened_desc = ggml_model_desc_node_flatten (model_desc_node);
+  size_t memory_size = ggml_estimate_model_size_from_flattened_desc (flattened_desc);
+  g_autoptr (GGMLContext) context = ggml_context_new (memory_size);
   g_autoptr (GGMLModel) model = ggml_model_new_from_flattened_desc (context,
                                                                     flattened_desc,
                                                                     forward_func,

From 61da8968c11b4f3413f229c53c8730dacdd69463 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 21:21:13 +0300
Subject: [PATCH 09/22] ggml-model: Convert data as-needed upon reading a
 model file

---
 ggml-gobject/ggml-model.c | 318 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 306 insertions(+), 12 deletions(-)

diff --git a/ggml-gobject/ggml-model.c b/ggml-gobject/ggml-model.c
index 65e115a..249a837 100644
--- a/ggml-gobject/ggml-model.c
+++ b/ggml-gobject/ggml-model.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 
 struct _GGMLModel {
   GGMLContext *owning_context;
@@ -101,6 +102,294 @@ product_i32 (int32_t *array, size_t n)
   return product;
 }
 
+static inline int64_t
+product_i64 (int64_t *array, size_t n)
+{
+  int64_t product = 1;
+  for (size_t i = 0; i < n; ++i)
+    {
+      product *= array[i];
+    }
+
+  return product;
+}
+
+static GArray *
+data_to_f32_array (GGMLDataType data_type,
+                   char *data,
+                   size_t n_elements,
+                   float **out_data_ptr)
+{
+  g_assert (data_type == GGML_DATA_TYPE_F16 || data_type == GGML_DATA_TYPE_F32);
+
+  if (data_type == GGML_DATA_TYPE_F16)
+    {
+      g_autoptr(GArray) tensor_data = g_array_sized_new (FALSE, FALSE, sizeof (float), n_elements);
+      tensor_data->len = n_elements;
+
+      ggml_fp16_t *fp16_data = (ggml_fp16_t *) data;
+
+      for (size_t i = 0; i < n_elements; ++i)
+        {
+          g_array_index (tensor_data, float, i) = ggml_fp16_to_fp32 (fp16_data[i]);
+        }
+
+      *out_data_ptr = (float *) tensor_data->data;
+      return g_steal_pointer (&tensor_data);
+    }
+
+  /* In this case, we just return NULL and instead
+   * alias the data directly */
+  *out_data_ptr = (float *) data;
+
+  return NULL;
+}
+
+static size_t
+convert_f32_to_f16 (const float *data,
+                    size_t n_elements,
+                    char *out_data_ptr)
+{
+  ggml_fp16_t *fp16_ptr = (ggml_fp16_t *) out_data_ptr;
+  const float *fp32_ptr = (const float *) data;
+
+  for (size_t i = 0; i < n_elements; ++i)
+    {
+      fp16_ptr[i] = ggml_fp32_to_fp16 (fp32_ptr[i]);
+    }
+
+  return n_elements * ggml_type_size ((enum ggml_type) GGML_DATA_TYPE_F16);
+}
+
+/**
+ * convert_data_for_model: (skip)
+ * @src_type: The source #GGMLDataType
+ * @original_data: The original data in bytes
+ * @original_data_length: The length of the original data
+ * @shape: The shape of the original data
+ * @n_dims: The number of dimensions in the original data's shape
+ * @tgt_type: The target #GGMLDataType to quantize or convert into
+ * @histogram: (inout) (array length=n_histogram): A histogram to write into
+ * @n_histogram: Length of @histogram
+ * @out_data: (inout): An output pointer for quantized data
+ * @out_data_len: The size of the output quantized data.
+ * @error: A #GError
+ *
+ * Quantize tensor data or convert to another data type.
+ * Histogram information is written into @histogram if it is set.
+ *
+ * Returns: %TRUE on success, %FALSE with @error set on failure.
+ */
+static gboolean
+convert_data_for_model (GGMLDataType src_type,
+                        char *original_data,
+                        size_t original_data_length,
+                        int64_t *shape,
+                        size_t n_dims,
+                        GGMLDataType tgt_type,
+                        int64_t *histogram,
+                        size_t n_histogram,
+                        char *out_data,
+                        size_t out_data_len,
+                        GError **error)
+{
+  /* This should not usually happen, but in this case we memcpy
+   * directly into the out_data ptr, assuming that it is not aliased */
+  if (src_type == tgt_type)
+    {
+      if (out_data == original_data)
+        {
+          return TRUE;
+        }
+
+      if (out_data_len != original_data_length)
+        {
+          g_set_error (error,
+                       G_IO_ERROR,
+                       G_IO_ERROR_FAILED,
+                       "Cannot copy from src to tgt, buffer sizes (src: %zu, tgt: %zu) differ",
+                       original_data_length,
+                       out_data_len);
+          return FALSE;
+        }
+
+      memcpy (out_data, original_data, out_data_len);
+      return TRUE;
+    }
+
+  if (src_type != GGML_DATA_TYPE_F32 &&
+      src_type != GGML_DATA_TYPE_F16)
+    {
+      g_autoptr(GEnumClass) src_type_enum = g_type_class_ref (GGML_TYPE_DATA_TYPE);
+      GEnumValue *src_data_type_value = g_enum_get_value (src_type_enum, src_type);
+      g_set_error (error,
+                   G_IO_ERROR,
+                   G_IO_ERROR_FAILED,
+                   "Cannot convert from src_type %s, src_type must be F32 or F16",
+                   src_data_type_value->value_name);
+      return FALSE;
+    }
+
+  float *data_f32_ptr = NULL;
+
+  /* We return data_f32 here, but it might not be defined - it's
+   * just a convenience to free the data later, but the real
+   * data pointer comes from data_ptr. This helps to avoid copies where
+   * not necessary. */
+  size_t n_elements = product_i64 (shape, n_dims);
+  g_autoptr(GArray) data_f32 = data_to_f32_array (src_type,
+                                                  original_data,
+                                                  n_elements,
+                                                  &data_f32_ptr);
+
+  size_t cur_size = 0;
+
+  switch (tgt_type)
+    {
+      case GGML_DATA_TYPE_Q4_0:
+        cur_size = ggml_quantize_q4_0 ((float *) data_f32_ptr,
+                                       out_data,
+                                       n_elements,
+                                       shape[0],
+                                       histogram);
+        break;
+      case GGML_DATA_TYPE_Q4_1:
+        cur_size = ggml_quantize_q4_1 ((float *) data_f32_ptr,
+                                       out_data,
+                                       n_elements,
+                                       shape[0],
+                                       histogram);
+        break;
+      case GGML_DATA_TYPE_Q5_0:
+        cur_size = ggml_quantize_q5_0 ((float *) data_f32_ptr,
+                                       out_data,
+                                       n_elements,
+                                       shape[0],
+                                       histogram);
+        break;
+      case GGML_DATA_TYPE_Q5_1:
+        cur_size = ggml_quantize_q5_1 ((float *) data_f32_ptr,
+                                       out_data,
+                                       n_elements,
+                                       shape[0],
+                                       histogram);
+        break;
+      case GGML_DATA_TYPE_Q8_0:
+        cur_size = ggml_quantize_q8_0 ((float *) data_f32_ptr,
+                                       out_data,
+                                       n_elements,
+                                       shape[0],
+                                       histogram);
+        break;
+      case GGML_DATA_TYPE_F16:
+        cur_size = convert_f32_to_f16 ((float *) data_f32_ptr,
+                                       n_elements,
+                                       out_data);
+        break;
+      default:
+        {
+          g_autoptr(GEnumClass) tgt_type_class = g_type_class_ref (GGML_TYPE_DATA_TYPE);
+          GEnumValue *tgt_data_type_value = g_enum_get_value (tgt_type_class, tgt_type);
+          g_set_error (error,
+                       G_IO_ERROR,
+                       G_IO_ERROR_FAILED,
+                       "Conversion failed, tgt_type cannot be %s",
+                       tgt_data_type_value->value_name);
+          return FALSE;
+        }
+    }
+
+  g_assert (cur_size <= out_data_len);
+
+  return TRUE;
+}
+
+static gboolean
+read_into_tensor (GGMLTensor *tensor,
+                  GGMLDataType stream_data_type,
+                  GInputStream *istream,
+                  int64_t *histogram,
+                  size_t histogram_len,
+                  GCancellable *cancellable,
+                  GError **error)
+{
+  GGMLDataType tensor_data_type = ggml_tensor_get_data_type (tensor);
+  size_t tensor_definition_n_elements = ggml_tensor_n_elements (tensor);
+  size_t stream_bytes_per_element = ggml_size_of_data_type (stream_data_type);
+  size_t expected_bytes = (tensor_definition_n_elements * stream_bytes_per_element / ggml_blck_size ((enum ggml_type) stream_data_type));
+  size_t allocated_bytes = 0;
+  char *tensor_data_ptr = ggml_tensor_get_data (tensor, &allocated_bytes);
+
+  if (stream_data_type != tensor_data_type)
+    {
+      /* Conversion required. First read the data from the stream,
+       * then convert it and write the result into the tensor */
+      g_autoptr(GArray) stream_data = g_array_sized_new (FALSE, TRUE, sizeof (char), expected_bytes);
+      stream_data->len = expected_bytes;
+
+      /* Now we can read the tensor data */
+      if (!ggml_input_stream_read_exactly (istream,
+                                           stream_data->data,
+                                           expected_bytes,
+                                           cancellable,
+                                           error))
+        {
+          return FALSE;
+        }
+
+      /* Now apply the conversion required */
+      GError *my_error = NULL;
+      size_t n_dims;
+      int64_t *shape = ggml_tensor_get_shape (tensor, &n_dims);
+
+      if (!convert_data_for_model (stream_data_type,
+                                   stream_data->data,
+                                   stream_data->len,
+                                   shape,
+                                   n_dims,
+                                   tensor_data_type,
+                                   histogram,
+                                   histogram_len,
+                                   tensor_data_ptr,
+                                   allocated_bytes,
+                                   &my_error))
+        {
+          g_set_error (error,
+                       G_IO_ERROR,
+                       G_IO_ERROR_FAILED,
+                       "Unable to convert: %s",
+                       my_error->message);
+          g_clear_error (&my_error);
+          return FALSE;
+        }
+
+      return TRUE;
+    }
+
+  if (expected_bytes != allocated_bytes)
+    {
+      g_set_error (error,
+                   G_IO_ERROR,
+                   G_IO_ERROR_FAILED,
+                   "Tensor allocation of %zu bytes, expected %zu bytes",
+                   allocated_bytes,
+                   expected_bytes);
+      return FALSE;
+    }
+
+  /* No conversion required, just read the tensor */
+  if (!ggml_input_stream_read_exactly (istream,
+                                       tensor_data_ptr,
+                                       allocated_bytes,
+                                       cancellable,
+                                       error))
+    {
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
 static gboolean
 ggml_model_load_weights_from_istream (GInputStream *istream,
                                       GGMLModel *model,
@@ -109,6 +398,8 @@ ggml_model_load_weights_from_istream (GInputStream *istream,
                                       GError **error)
 {
   g_autoptr(GPtrArray) loaded_keys = g_ptr_array_new_full (0, g_free);
+  g_autoptr(GArray) histogram = g_array_sized_new (FALSE, TRUE, sizeof (int64_t), 1 << 4);
+  histogram->len = 1 << 4;
 
   while (TRUE)
     {
@@ -176,20 +467,23 @@ ggml_model_load_weights_from_istream (GInputStream *istream,
           return FALSE;
         }
 
-      size_t bytes_per_element = ggml_size_of_data_type (ttype);
-      size_t allocated_bytes = 0;
-      char *tensor_data_ptr = ggml_tensor_get_data (tensor, &allocated_bytes);
-
-      size_t expected_bytes = (tensor_definition_n_elements * bytes_per_element / ggml_tensor_block_size (tensor));
-      if (expected_bytes != allocated_bytes)
-        {
-          g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED, "Tensor %s has allocation of %zu bytes, expected %zu bytes", name_buffer, allocated_bytes, expected_bytes);
-          return FALSE;
-        }
+      GError *my_error = NULL;
 
-      /* Now we can read the tensor */
-      if (!ggml_input_stream_read_exactly (istream, tensor_data_ptr, allocated_bytes, cancellable, error))
+      if (!read_into_tensor (tensor,
+                             ttype,
+                             istream,
+                             (int64_t *) histogram->data,
+                             histogram->len,
+                             cancellable,
+                             &my_error))
         {
+          g_set_error (error,
+                       G_IO_ERROR,
+                       G_IO_ERROR_FAILED,
+                       "Unable to read into tensor %s: %s",
+                       name_buffer,
+                       my_error->message);
+          g_clear_error (&my_error);
           return FALSE;
         }

From 24905f9748cb3302b21da57cd4c9b4675a8b4774 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 22:34:22 +0300
Subject: [PATCH 10/22] ggml-model-config: Add GGMLModelConfig

This can be used to pre-configure how we would like the model to be
loaded. Right now it supports a configuration for quantization.
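
For example, a caller might set up Q8_0 quantization before loading (a
sketch only; the regex string here is illustrative):

    g_autoptr(GGMLModelConfig) config = ggml_model_config_new ();
    const char *quantize[] = { "model/h.*/attn/c_attn/w", NULL };

    /* Quantize matching weights to Q8_0; a NULL skip-list means
     * everything matched by the first list gets quantized */
    ggml_model_config_set_quantization_config (config,
                                               GGML_DATA_TYPE_Q8_0,
                                               quantize,
                                               NULL);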
A design principle of GGMLModelConfig is that it can also be used when
the config is NULL: the config_get functions accept a NULL config object
and return a gboolean indicating whether that part of the configuration
differs from the default.
---
 ggml-gobject/ggml-model-config.c | 147 +++++++++++++++++++++++++++++++
 ggml-gobject/ggml-model-config.h |  50 ++++++++++
 ggml-gobject/meson.build         |   2 +
 3 files changed, 199 insertions(+)
 create mode 100644 ggml-gobject/ggml-model-config.c
 create mode 100644 ggml-gobject/ggml-model-config.h

diff --git a/ggml-gobject/ggml-model-config.c b/ggml-gobject/ggml-model-config.c
new file mode 100644
index 0000000..3ae986f
--- /dev/null
+++ b/ggml-gobject/ggml-model-config.c
+/*
+ * ggml-gobject/ggml-model-config.c
+ *
+ * Implementation for ggml-model-config
+ *
+ * Copyright (C) 2023 Sam Spilsbury.
+ *
+ * ggml-gobject is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ggml-gobject is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ggml-gobject; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include
+
+struct _GGMLModelConfig {
+  size_t ref_count;
+  GGMLDataType quantization_type;
+  GStrv quantization_regexes;
+  GStrv skip_quantization_regexes;
+
+  gboolean quantization_type_set : 1;
+};
+
+/**
+ * ggml_model_config_new:
+ *
+ * Returns: A new #GGMLModelConfig
+ */
+GGMLModelConfig *
+ggml_model_config_new (void)
+{
+  GGMLModelConfig *config = g_new0 (GGMLModelConfig, 1);
+  config->ref_count = 1;
+
+  return config;
+}
+
+/**
+ * ggml_model_config_ref: (skip)
+ * @config: A #GGMLModelConfig
+ *
+ * Increases the ref count on @config
+ *
+ * Returns: (transfer full): The @config with ref count increased.
+ */
+GGMLModelConfig *
+ggml_model_config_ref (GGMLModelConfig *config)
+{
+  ++config->ref_count;
+  return config;
+}
+
+/**
+ * ggml_model_config_unref:
+ * @config: A #GGMLModelConfig
+ *
+ * Decreases the ref count on @config and frees the underlying config
+ * if the ref count goes to zero.
+ */
+void
+ggml_model_config_unref (GGMLModelConfig *config)
+{
+  if (--config->ref_count == 0)
+    {
+      g_clear_pointer (&config, g_free);
+    }
+}
+
+/**
+ * ggml_model_config_set_quantization_config:
+ * @config: A #GGMLModelConfig
+ * @quantization_type: A #GGMLDataType
+ * @quantization_regexes: (array zero-terminated=1) (nullable): A strv of regular expressions
+ *                        of weights to apply quantization to.
+ * @skip_quantization_regexes: (array zero-terminated=1) (nullable): A strv of regular expressions
+ *                             of weights to not quantize.
+ */
+void
+ggml_model_config_set_quantization_config (GGMLModelConfig *config,
+                                           GGMLDataType quantization_type,
+                                           const char **quantization_regexes,
+                                           const char **skip_quantization_regexes)
+{
+  config->quantization_type = quantization_type;
+  config->quantization_regexes = g_strdupv ((GStrv) quantization_regexes);
+  config->skip_quantization_regexes = g_strdupv ((GStrv) skip_quantization_regexes);
+  config->quantization_type_set = TRUE;
+}
+
+/**
+ * ggml_model_config_get_quantization_config:
+ * @config: A #GGMLModelConfig
+ * @out_quantization_type: (out) (nullable): A #GGMLDataType out parameter
+ * @out_quantization_regexes: (out) (nullable) (array zero-terminated=1) (transfer none): The regular
+ *                            expressions of weights to quantize, out-parameter.
+ * @out_skip_quantization_regexes: (out) (nullable) (array zero-terminated=1) (transfer none): The regular
+ *                                 expressions of weights not to quantize, out-parameter.
+ *
+ * Returns: %TRUE if the quantization type has been set, with @out_quantization_type
+ *          set to the internal quantization type, otherwise %FALSE
+ */
+gboolean
+ggml_model_config_get_quantization_config (GGMLModelConfig *config,
+                                           GGMLDataType *out_quantization_type,
+                                           const char ***out_quantization_regexes,
+                                           const char ***out_skip_quantization_regexes)
+{
+  if (config == NULL)
+    {
+      return FALSE;
+    }
+
+  if (!config->quantization_type_set)
+    {
+      return FALSE;
+    }
+
+  if (out_quantization_type != NULL)
+    {
+      *out_quantization_type = config->quantization_type;
+    }
+
+  if (out_quantization_regexes != NULL)
+    {
+      *out_quantization_regexes = (const char **) config->quantization_regexes;
+    }
+
+  if (out_skip_quantization_regexes != NULL)
+    {
+      *out_skip_quantization_regexes = (const char **) config->skip_quantization_regexes;
+    }
+
+  return TRUE;
+}
+
+G_DEFINE_BOXED_TYPE (GGMLModelConfig, ggml_model_config, ggml_model_config_ref, ggml_model_config_unref);
diff --git a/ggml-gobject/ggml-model-config.h b/ggml-gobject/ggml-model-config.h
new file mode 100644
index 0000000..febc1ed
--- /dev/null
+++ b/ggml-gobject/ggml-model-config.h
+/*
+ * ggml-gobject/ggml-model-config.h
+ *
+ * Header file for ggml-model-config
+ *
+ * Copyright (C) 2023 Sam Spilsbury.
+ *
+ * ggml-gobject is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ggml-gobject is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ggml-gobject; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#pragma once
+
+#include
+#include
+
+G_BEGIN_DECLS
+
+typedef struct _GGMLModelConfig GGMLModelConfig;
+
+GGMLModelConfig * ggml_model_config_new (void);
+GGMLModelConfig * ggml_model_config_ref (GGMLModelConfig *config);
+void ggml_model_config_unref (GGMLModelConfig *config);
+
+void ggml_model_config_set_quantization_config (GGMLModelConfig *config,
+                                                GGMLDataType quantization_type,
+                                                const char **quantization_regexes,
+                                                const char **skip_quantization_regexes);
+gboolean ggml_model_config_get_quantization_config (GGMLModelConfig *config,
+                                                    GGMLDataType *out_quantization_type,
+                                                    const char ***out_quantization_regexes,
+                                                    const char ***out_skip_quantization_regexes);
+
+#define GGML_TYPE_MODEL_CONFIG (ggml_model_config_get_type ())
+GType ggml_model_config_get_type (void);
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC (GGMLModelConfig, ggml_model_config_unref)
+
+G_END_DECLS
diff --git a/ggml-gobject/meson.build b/ggml-gobject/meson.build
index 6298769..1267d62 100644
--- a/ggml-gobject/meson.build
+++ b/ggml-gobject/meson.build
@@ -12,6 +12,7 @@ ggml_gobject_toplevel_introspectable_headers = files([
   'ggml-gpt.h',
   'ggml-hyperparameters.h',
   'ggml-language-model.h',
+  'ggml-model-config.h',
   'ggml-model-desc.h',
   'ggml-model.h',
   'ggml-ops.h',
@@ -28,6 +29,7 @@ ggml_gobject_toplevel_introspectable_sources = files([
   'ggml-gpt.c',
   'ggml-hyperparameters.c',
   'ggml-language-model.c',
+  'ggml-model-config.c',
   'ggml-model-desc.c',
   'ggml-model.c',
   'ggml-ops.c',

From 3bdbc264d7eaff438afe50aa32bb91b22752acd2 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 22:37:41 +0300
Subject: [PATCH 11/22] ggml-language-model: Add a GGMLModelConfig argument to
 the language model constructors

---
 ggml-gobject/ggml-language-model.c | 15 +++++++++++++++
 ggml-gobject/ggml-language-model.h |  5 +++++
 tests/js/testLoadGPT2.js           |  8 ++++++++
 3 files changed, 28 insertions(+)

diff --git a/ggml-gobject/ggml-language-model.c b/ggml-gobject/ggml-language-model.c
index 5b41c04..62604e2 100644
--- a/ggml-gobject/ggml-language-model.c
+++ b/ggml-gobject/ggml-language-model.c
@@ -869,6 +869,7 @@ ggml_model_set_possible_tied_weights (GGMLModel *model,
 /**
  * ggml_language_model_load_from_istream:
  * @istream: (transfer none): A #GInputStream
+ * @model_config: (nullable): A #GGMLModelConfig
  * @create_model_desc: (transfer none) (scope call): A #GGMLModelDescFromHyperparametersFunc to specify the model structure and weights
 * @create_model_desc_user_data: (closure create_model_desc): A closure for @create_model_desc
  * @forward_func: (scope notified) (nullable): A #GGMLModelForwardFunc
@@ -884,6 +885,7 @@ ggml_model_set_possible_tied_weights (GGMLModel *model,
  */
 GGMLLanguageModel *
 ggml_language_model_load_from_istream (GInputStream *istream,
+                                       GGMLModelConfig *model_config,
                                        GGMLModelDescFromHyperparametersFunc create_model_desc,
                                        gpointer create_model_desc_user_data,
                                        GGMLModelForwardFunc forward_func,
@@ -971,6 +973,7 @@ static struct GGMLLanguageModelDefinitions {
  * ggml_language_model_load_defined_from_istream:
  * @model: A #GGMLDefinedLanguageModel configuration to load
  * @istream: (transfer none): A #GInputStream
+ * @model_config: (transfer none) (nullable): A #GGMLModelConfig
  * @cancellable: (transfer none): A #GCancellable
  * @error: A #GError
  *
@@ -985,10 +988,12 @@ static struct GGMLLanguageModelDefinitions {
 GGMLLanguageModel *
 ggml_language_model_load_defined_from_istream (GGMLDefinedLanguageModel model,
                                                GInputStream *istream,
+                                               GGMLModelConfig *model_config,
                                                GCancellable *cancellable,
                                                GError **error)
 {
   return ggml_language_model_load_from_istream (istream,
+                                                model_config,
                                                 ggml_language_model_definitions[model].model_desc_from_hyperparameters_func,
                                                 NULL,
                                                 ggml_language_model_definitions[model].forward_func,
@@ -1001,6 +1006,7 @@ ggml_language_model_load_defined_from_istream (GGMLDefinedLanguageModel model
 typedef struct _GGMLLanguageModelLoadFromIstreamData
 {
   GInputStream *istream;
+  GGMLModelConfig *config;
   GGMLModelDescFromHyperparametersFunc create_model_desc;
   gpointer create_model_desc_user_data;
   GDestroyNotify create_model_desc_user_data_destroy;
@@ -1017,6 +1023,7 @@ typedef struct _GGMLLanguageModelLoadFromIstreamData
 
 static GGMLLanguageModelLoadFromIstreamData *
 ggml_language_model_load_from_istream_data_new (GInputStream *istream,
+                                                GGMLModelConfig *config,
                                                 GGMLModelDescFromHyperparametersFunc create_model_desc,
                                                 gpointer create_model_desc_user_data,
                                                 GDestroyNotify create_model_desc_user_data_destroy,
@@ -1027,6 +1034,7 @@ ggml_language_model_load_from_istream_data_new (GInputStream *istream,
 
   GGMLLanguageModelLoadFromIstreamData *data = g_new0 (GGMLLanguageModelLoadFromIstreamData, 1);
   data->istream = g_object_ref (istream);
+  data->config = config != NULL ? ggml_model_config_ref (config) : NULL;
   data->create_model_desc = create_model_desc;
   data->create_model_desc_user_data = create_model_desc_user_data;
   data->create_model_desc_user_data_destroy = create_model_desc_user_data_destroy;
@@ -1041,6 +1049,7 @@ void
 ggml_language_model_load_from_istream_data_free (GGMLLanguageModelLoadFromIstreamData *data)
 {
   g_clear_pointer (&data->istream, g_object_unref);
+  g_clear_pointer (&data->config, ggml_model_config_unref);
   g_clear_pointer (&data->create_model_desc_user_data, data->create_model_desc_user_data_destroy);
   g_clear_pointer (&data->forward_func_user_data, data->forward_func_user_data_destroy);
 
@@ -1181,6 +1190,7 @@ ggml_language_model_load_from_istream_on_magic_read (GObject *src,
 /**
  * ggml_language_model_load_from_istream_async:
  * @istream: (transfer none): A #GInputStream
+ * @model_config: (nullable): A #GGMLModelConfig
  * @create_model_desc: (transfer none) (scope call): A #GGMLModelDescFromHyperparametersFunc to specify the model structure and weights
 * @create_model_desc_user_data: (closure create_model_desc): A closure for @create_model_desc
  * @create_model_desc_user_data_destroy: (destroy create_model_desc): A #GDestroyNotify for create_model_desc
@@ -1197,6 +1207,7 @@ ggml_language_model_load_from_istream_on_magic_read (GObject *src,
  */
 void
 ggml_language_model_load_from_istream_async (GInputStream *istream,
+                                             GGMLModelConfig *model_config,
                                              GGMLModelDescFromHyperparametersFunc create_model_desc,
                                              gpointer create_model_desc_user_data,
                                              GDestroyNotify create_model_desc_user_data_destroy,
@@ -1208,6 +1219,7 @@ ggml_language_model_load_from_istream_async (GInputStream *istream,
                                              gpointer user_data)
 {
   g_autoptr(GGMLLanguageModelLoadFromIstreamData) data = ggml_language_model_load_from_istream_data_new(istream,
+                                                                                                        model_config,
                                                                                                         create_model_desc,
                                                                                                         create_model_desc_user_data,
                                                                                                         create_model_desc_user_data_destroy,
@@ -1268,6 +1280,7 @@ ggml_language_model_load_defined_from_istream_finish (GAsyncResult *result,
  * ggml_language_model_load_defined_from_istream_async:
  * @model: A #GGMLDefinedLanguageModel configuration to load
  * @istream: (transfer none): A #GInputStream
+ * @model_config: (nullable): A #GGMLModelConfig
  * @cancellable: (transfer none) (nullable): A #GCancellable
 * @callback: A #GAsyncReadyCallback to be called when loading is complete.
  * @user_data: (closure callback): Some user data for @callback
@@ -1281,12 +1294,14 @@ ggml_language_model_load_defined_from_istream_finish (GAsyncResult *result,
 void
 ggml_language_model_load_defined_from_istream_async (GGMLDefinedLanguageModel model,
                                                      GInputStream *istream,
+                                                     GGMLModelConfig *model_config,
                                                      GCancellable *cancellable,
                                                      GAsyncReadyCallback callback,
                                                      gpointer user_data,
                                                      GError **error)
 {
   ggml_language_model_load_from_istream_async (istream,
+                                               model_config,
                                                ggml_language_model_definitions[model].model_desc_from_hyperparameters_func,
                                                NULL,
                                                NULL,
diff --git a/ggml-gobject/ggml-language-model.h b/ggml-gobject/ggml-language-model.h
index 5db7816..45a3247 100644
--- a/ggml-gobject/ggml-language-model.h
+++ b/ggml-gobject/ggml-language-model.h
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -60,6 +61,7 @@ typedef enum {
 } GGMLDefinedLanguageModel;
 
 GGMLLanguageModel *ggml_language_model_load_from_istream (GInputStream *istream,
+                                                          GGMLModelConfig *model_config,
                                                           GGMLModelDescFromHyperparametersFunc create_model_desc,
                                                           gpointer create_model_desc_user_data,
                                                           GGMLModelForwardFunc forward_func,
@@ -69,6 +71,7 @@ GGMLLanguageModel *ggml_language_model_load_from_istream (GInputStream *istream,
                                                           GError **error);
 
 void ggml_language_model_load_from_istream_async (GInputStream *istream,
+                                                  GGMLModelConfig *model_config,
                                                   GGMLModelDescFromHyperparametersFunc create_model_desc,
                                                   gpointer create_model_desc_user_data,
                                                   GDestroyNotify create_model_desc_user_data_destroy,
@@ -83,10 +86,12 @@ GGMLLanguageModel * ggml_language_model_load_from_istream_finish (GAsyncResult
 
 GGMLLanguageModel *ggml_language_model_load_defined_from_istream (GGMLDefinedLanguageModel model,
                                                                   GInputStream *istream,
+                                                                  GGMLModelConfig *model_config,
                                                                   GCancellable *cancellable,
                                                                   GError **error);
 void ggml_language_model_load_defined_from_istream_async (GGMLDefinedLanguageModel model,
                                                           GInputStream *istream,
+                                                          GGMLModelConfig *model_config,
                                                           GCancellable *cancellable,
                                                           GAsyncReadyCallback callback,
                                                           gpointer user_data,
diff --git a/tests/js/testLoadGPT2.js b/tests/js/testLoadGPT2.js
index 22a22f8..b62d734 100644
--- a/tests/js/testLoadGPT2.js
+++ b/tests/js/testLoadGPT2.js
@@ -375,6 +375,7 @@ describe('GGML GPT2', function() {
 
     const language_model = GGML.LanguageModel.load_from_istream(
       istream,
+      null,
      (hyperparameters) => createModelDescGPT2(
         hyperparameters.get_int32('n_vocab'),
         hyperparameters.get_int32('n_embd'),
@@ -391,6 +392,7 @@ describe('GGML GPT2', function() {
 
     GGML.LanguageModel.load_from_istream_async(
       istream,
+      null,
      (hyperparameters) => createModelDescGPT2(
         hyperparameters.get_int32('n_vocab'),
         hyperparameters.get_int32('n_embd'),
@@ -413,6 +415,7 @@ describe('GGML GPT2', function() {
       GGML.DefinedLanguageModel.GPT2P177M,
       istream,
       null,
+      null,
       (src, res) => {
         GGML.LanguageModel.load_defined_from_istream_finish (res);
         done();
@@ -425,6 +428,7 @@ describe('GGML GPT2', function() {
     const language_model = GGML.LanguageModel.load_defined_from_istream(
       GGML.DefinedLanguageModel.GPT2P177M,
       istream,
+      null,
       null
     );
 
@@ -438,6 +442,7 @@ describe('GGML GPT2', function() {
     const language_model = GGML.LanguageModel.load_defined_from_istream(
       GGML.DefinedLanguageModel.GPT2P177M,
       istream,
+      null,
       null
     );
 
@@ -453,6 +458,7 @@ describe('GGML GPT2', function() {
     const language_model = GGML.LanguageModel.load_defined_from_istream(
       GGML.DefinedLanguageModel.GPT2P177M,
       istream,
+      null,
       null
     );
 
@@ -483,6 +489,7 @@ describe('GGML GPT2', function() {
     const language_model = GGML.LanguageModel.load_defined_from_istream(
       GGML.DefinedLanguageModel.GPT2P177M,
       istream,
+      null,
       null
     );
 
@@ -516,6 +523,7 @@ describe('GGML GPT2', function() {
 
     const language_model = GGML.LanguageModel.load_from_istream(
       istream,
+      null,
      (hyperparameters) => createModelDescGPT2(
         hyperparameters.get_int32('n_vocab'),
         hyperparameters.get_int32('n_embd'),

From ab9facf74f2d6c7cce0f822c1c746c1aec2faf52 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 22:38:29 +0300
Subject: [PATCH 12/22] ggml-quantize: Add helper functions for setting up
 quantization in the ModelDesc

---
 ggml-gobject/ggml-quantize.c | 160 +++++++++++++++++++++++++++++++
 ggml-gobject/ggml-quantize.h |  37 ++++++++
 ggml-gobject/meson.build     |   2 +
 3 files changed, 199 insertions(+)
 create mode 100644 ggml-gobject/ggml-quantize.c
 create mode 100644 ggml-gobject/ggml-quantize.h

diff --git a/ggml-gobject/ggml-quantize.c b/ggml-gobject/ggml-quantize.c
new file mode 100644
index 0000000..d0473df
--- /dev/null
+++ b/ggml-gobject/ggml-quantize.c
+/*
+ * ggml-gobject/ggml-quantize.c
+ *
+ * Implementation for ggml-quantize
+ *
+ * Copyright (C) 2023 Sam Spilsbury.
+ *
+ * ggml-gobject is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ggml-gobject is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ggml-gobject; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include
+#include
+
+static gboolean
+matches_quantize_regexes (GRegex **quantize_keys,
+                          GRegex **skip_keys,
+                          const char *weight_path)
+{
+  if (skip_keys != NULL)
+    {
+      for (GRegex **skip_key_regex_ptr = skip_keys;
+           *skip_key_regex_ptr != NULL;
+           ++skip_key_regex_ptr)
+        {
+          if (g_regex_match (*skip_key_regex_ptr, weight_path, 0, NULL))
+            {
+              return FALSE;
+            }
+
+        }
+    }
+
+  if (quantize_keys == NULL)
+    {
+      return FALSE;
+    }
+
+  gboolean should_quantize = FALSE;
+
+  for (GRegex **quantize_key_regex_ptr = quantize_keys;
+       *quantize_key_regex_ptr != NULL;
+       ++quantize_key_regex_ptr)
+    {
+      should_quantize |= g_regex_match (*quantize_key_regex_ptr, weight_path, 0, NULL);
+    }
+
+  return should_quantize;
+}
+
+static gboolean
+strv_to_regex_array (const char **strv,
+                     GPtrArray **out_ptr_array,
+                     GError **error)
+{
+  if (strv == NULL)
+    {
+      *out_ptr_array = NULL;
+      return TRUE;
+    }
+
+  g_autoptr(GPtrArray) regex_array = g_ptr_array_new_null_terminated (g_strv_length ((GStrv) strv),
+                                                                      (GDestroyNotify) g_regex_unref,
+                                                                      TRUE);
+
+  for (const char **ptr = strv; *ptr != NULL; ++ptr)
+    {
+      GRegex *regex = g_regex_new (*ptr, 0, 0, error);
+
+      if (regex == NULL)
+        {
+          *out_ptr_array = NULL;
+          return FALSE;
+        }
+
+      g_ptr_array_add (regex_array, regex);
+    }
+
+  *out_ptr_array = g_steal_pointer (&regex_array);
+  return TRUE;
+}
+
+typedef struct {
+  GRegex **quantize_regex_objects;
+  GRegex **skip_regex_objects;
+  GGMLDataType quantize_type;
+} QuantizeByRegexMapFuncData;
+
+static GGMLModelDescLeaf *
+quantize_by_regex_map_func (const char *path,
+                            const GGMLModelDescLeaf *leaf,
+                            gpointer user_data)
+{
+  QuantizeByRegexMapFuncData *data = user_data;
+
+  if (leaf->n_dim != 2 ||
+      !matches_quantize_regexes (data->quantize_regex_objects, data->skip_regex_objects, path))
+    {
+      return ggml_model_desc_leaf_new (leaf->dimensions, leaf->n_dim, leaf->type);
+    }
+
+  return ggml_model_desc_leaf_new (leaf->dimensions, leaf->n_dim, data->quantize_type);
+}
+
+/**
+ * ggml_configure_quantized_model_desc_by_regexes:
+ * @model_desc: A #GGMLModelDescNode
+ * @quantize_type: A #GGMLDataType
+ * @quantize_regexes: (array zero-terminated=1) (nullable): A #GStrv of regular expressions for weights
+ *                    to quantize.
+ * @skip_regexes: (array zero-terminated=1) (nullable): A #GStrv of regular expressions for weights to
+ *                not quantize, if they are matched by @quantize_regexes
+ * @error: A #GError out-parameter
+ *
+ * Returns: (transfer full): A new #GGMLModelDescNode tree with weights marked for quantization as appropriate, or %NULL with
+ *          @error set on failure.
+ */
+GGMLModelDescNode *
+ggml_configure_quantized_model_desc_by_regexes (GGMLModelDescNode *model_desc,
+                                                GGMLDataType quantize_type,
+                                                const char **quantize_regexes,
+                                                const char **skip_regexes,
+                                                GError **error)
+{
+  /* Compile the quantize and skip regexes up-front so that the
+   * map callback below can use them directly */
+  g_autoptr(GPtrArray) quantize_regex_objects = NULL;
+  g_autoptr(GPtrArray) skip_regex_objects = NULL;
+
+  if (!strv_to_regex_array (quantize_regexes, &quantize_regex_objects, error))
+    {
+      return NULL;
+    }
+
+  if (!strv_to_regex_array (skip_regexes, &skip_regex_objects, error))
+    {
+      return NULL;
+    }
+
+  QuantizeByRegexMapFuncData data = {
+    .quantize_regex_objects = quantize_regex_objects != NULL ? (GRegex **) quantize_regex_objects->pdata : NULL,
+    .skip_regex_objects = skip_regex_objects != NULL ? (GRegex **) skip_regex_objects->pdata : NULL,
+    .quantize_type = quantize_type
+  };
+
+  return ggml_model_desc_map (model_desc,
+                              quantize_by_regex_map_func,
+                              &data);
+}
diff --git a/ggml-gobject/ggml-quantize.h b/ggml-gobject/ggml-quantize.h
new file mode 100644
index 0000000..462bbfd
--- /dev/null
+++ b/ggml-gobject/ggml-quantize.h
+/*
+ * ggml-gobject/ggml-quantize.h
+ *
+ * Header file for ggml-quantize
+ *
+ * Copyright (C) 2023 Sam Spilsbury.
+ *
+ * ggml-gobject is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ggml-gobject is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ggml-gobject; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+G_BEGIN_DECLS
+
+GGMLModelDescNode * ggml_configure_quantized_model_desc_by_regexes (GGMLModelDescNode *model_desc,
+                                                                    GGMLDataType quantize_type,
+                                                                    const char **quantize_regexes,
+                                                                    const char **skip_regexes,
+                                                                    GError **error);
+
+G_END_DECLS
diff --git a/ggml-gobject/meson.build b/ggml-gobject/meson.build
index 1267d62..06bdf04 100644
--- a/ggml-gobject/meson.build
+++ b/ggml-gobject/meson.build
@@ -16,6 +16,7 @@ ggml_gobject_toplevel_introspectable_headers = files([
   'ggml-model-desc.h',
   'ggml-model.h',
   'ggml-ops.h',
+  'ggml-quantize.h',
   'ggml-tensor.h',
   'ggml-token-dictionary.h',
   'ggml-types.h',
@@ -33,6 +34,7 @@ ggml_gobject_toplevel_introspectable_sources = files([
   'ggml-model-desc.c',
   'ggml-model.c',
   'ggml-ops.c',
+  'ggml-quantize.c',
   'ggml-tensor.c',
   'ggml-token-dictionary.c',
   'ggml-types.c',

From 929101be095f7cbdd4a049bef44c9cdfc76f33ac Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Wed, 26 Jul 2023 22:39:26 +0300
Subject: [PATCH 13/22] ggml-language-model: If using quantization, set up
 quantization in the ModelDescNode

We do this after loading the hyperparameters and starting to set up the
ModelDesc. The ModelDesc contains all the information about the types of
the tensors, and since we can now convert them on the fly during loading,
this is the perfect place to edit it once we know the desired
quantization configuration.
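
Sketched in terms of the new API (error handling elided; the variable
names are illustrative):

    /* After reading hyperparameters, build the usual f16/f32 model desc */
    g_autoptr(GGMLModelDescNode) desc = (*create_model_desc) (hyperparameters,
                                                              user_data);

    GGMLDataType quantize_type;
    const char **quantize_regexes = NULL;
    const char **skip_regexes = NULL;

    /* Tolerates a NULL config; returns FALSE if quantization is unset */
    if (ggml_model_config_get_quantization_config (config,
                                                   &quantize_type,
                                                   &quantize_regexes,
                                                   &skip_regexes))
      {
        /* Rewrite matching 2D leaves to the quantized data type */
        g_autoptr(GGMLModelDescNode) quantized =
            ggml_configure_quantized_model_desc_by_regexes (desc,
                                                            quantize_type,
                                                            quantize_regexes,
                                                            skip_regexes,
                                                            NULL);
        /* ...then load the weights against `quantized` instead of `desc` */
      }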
--- ggml-gobject/ggml-language-model.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/ggml-gobject/ggml-language-model.c b/ggml-gobject/ggml-language-model.c index 62604e2..3348eb6 100644 --- a/ggml-gobject/ggml-language-model.c +++ b/ggml-gobject/ggml-language-model.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -907,6 +908,29 @@ ggml_language_model_load_from_istream (GInputStream *istream, } g_autoptr (GGMLModelDescNode) model_desc_node = (*create_model_desc) (hyperparameters, create_model_desc_user_data); + + GGMLDataType quantized_type; + const char **quantize_regexes = NULL; + const char **skip_quantize_regexes = NULL; + gboolean should_quantize = ggml_model_config_get_quantization_config (model_config, + &quantized_type, + &quantize_regexes, + &skip_quantize_regexes); + + g_autoptr(GGMLModelDescNode) postprocessed_model_desc_node = ( + should_quantize ? ggml_configure_quantized_model_desc_by_regexes (model_desc_node, + quantized_type, + quantize_regexes, + skip_quantize_regexes, + error) : + ggml_model_desc_node_ref (model_desc_node) + ); + + if (postprocessed_model_desc_node == NULL) + { + return NULL; + } + int32_t n_vocab = ggml_hyperparameters_get_int32 (hyperparameters, "n_vocab"); g_autoptr(GGMLTokenDictionary) token_dictionary = ggml_token_dictionary_load_from_istream (istream, n_vocab, @@ -920,7 +944,7 @@ ggml_language_model_load_from_istream (GInputStream *istream, g_auto(GStrv) loaded_keys = NULL; g_autoptr(GGMLModel) model = ggml_model_load_from_istream (istream, - model_desc_node, + postprocessed_model_desc_node, hyperparameters, forward_func, forward_func_user_data, From 783f581a7695d812dedd4a256bc4d10f49ed8977 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Wed, 26 Jul 2023 22:41:16 +0300 Subject: [PATCH 14/22] ggml-gpt: Add a helper function to get the quantize regexes --- ggml-gobject/ggml-gpt.c | 22 ++++++++++++++++++++++ ggml-gobject/ggml-gpt.h | 1 + 2 files changed, 23 insertions(+) diff --git a/ggml-gobject/ggml-gpt.c b/ggml-gobject/ggml-gpt.c index b8844a7..b84f37d 100644 --- a/ggml-gobject/ggml-gpt.c +++ b/ggml-gobject/ggml-gpt.c @@ -20,6 +20,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ +#include #include #include @@ -619,3 +620,24 @@ ggml_gpt_model_forward_pass (GGMLModel *model, NULL); return g_steal_pointer (&lm_head_output); } + +static const char *ggml_gpt_model_quantize_regexes[] = { + "model/wte", + "model/lm_head", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + NULL +}; + +/** + * ggml_gpt_model_quantization_regexes: + * + * Returns: (transfer none) (array zero-terminated=1): A strv of weights to quantize for GPT models + */ +const char ** +ggml_gpt_model_quantization_regexes (void) +{ + return ggml_gpt_model_quantize_regexes; +} diff --git a/ggml-gobject/ggml-gpt.h b/ggml-gobject/ggml-gpt.h index 32a6a4b..0a1e674 100644 --- a/ggml-gobject/ggml-gpt.h +++ b/ggml-gobject/ggml-gpt.h @@ -52,6 +52,7 @@ GGMLModelDescNode * ggml_create_gpt2_model_desc (int32_t n_vocab, int32_t n_ctx); GGMLModelDescNode * ggml_create_gpt2_model_desc_from_hyperparameters (GGMLHyperparameters *hyperparameters); +const char ** ggml_gpt_model_quantization_regexes (void); GBytes * ggml_gpt_model_forward_pass_create_memory_buffer (size_t n_tokens); From e549447c6723763ddc9429a42e0ed6f0ebb84ec0 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Wed, 26 Jul 2023 22:41:39 +0300 Subject: [PATCH 15/22] testLoadGPT: Fix typo --- tests/js/testLoadGPT2.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/js/testLoadGPT2.js b/tests/js/testLoadGPT2.js index b62d734..dcd3f93 100644 --- a/tests/js/testLoadGPT2.js +++ b/tests/js/testLoadGPT2.js @@ -317,7 +317,7 @@ const gpt2ForwardPass = (model, hyperparameters, inputs, eval_parameters, cgraph describe('GGML GPT2', function() { afterEach(() => { System.gc(); - }) + }); it('can tokenize a simple string', function() { const token_dictionary = GGML.TokenDictionary.new([ "ab", From bba801439a20d677ce285cb583b23793bd988590 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Wed, 26 Jul 2023 22:41:49 +0300 Subject: [PATCH 16/22] tests/js: Add test for running with quantization --- tests/js/testLoadGPT2.js | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/js/testLoadGPT2.js b/tests/js/testLoadGPT2.js index dcd3f93..4a66dbb 100644 --- a/tests/js/testLoadGPT2.js +++ b/tests/js/testLoadGPT2.js @@ -543,4 +543,23 @@ describe('GGML GPT2', function() { ['The meaning of life is: to live in a world of abundance', false] ); }); -}) \ No newline at end of file + it('can do a forward pass through a quantized model', function() { + const istream = GGML.LanguageModel.stream_from_cache(GGML.DefinedLanguageModel.GPT2P177M); + const config = GGML.ModelConfig.new(); + + config.set_quantization_config(GGML.DataType.Q8_0, + GGML.gpt_model_quantization_regexes(), + null); + + const language_model = GGML.LanguageModel.load_defined_from_istream( + GGML.DefinedLanguageModel.GPT2P177M, + istream, + config, + null + ); + + expect(language_model.complete('The meaning of life is:', 7, null)).toEqual( + ['The meaning of life is: to live in a state of being', false] + ); + }); +}) From a178d557b1e0292d2a2f42235ccadcaad10a6ec4 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Thu, 27 Jul 2023 03:40:57 +0300 Subject: [PATCH 17/22] ggml-quantize: Use g_ptr_array_new_full instead of new_null_terminated Its better supported on older platforms --- ggml-gobject/ggml-quantize.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml-gobject/ggml-quantize.c b/ggml-gobject/ggml-quantize.c index d0473df..fa5f888 100644 --- 
a/ggml-gobject/ggml-quantize.c +++ b/ggml-gobject/ggml-quantize.c @@ -70,9 +70,8 @@ strv_to_regex_array (const char **strv, return TRUE; } - g_autoptr(GPtrArray) regex_array = g_ptr_array_new_null_terminated (g_strv_length ((GStrv) strv), - (GDestroyNotify) g_regex_unref, - TRUE); + g_autoptr(GPtrArray) regex_array = g_ptr_array_new_full (g_strv_length ((GStrv) strv), + (GDestroyNotify) g_regex_unref); for (const char **ptr = strv; *ptr != NULL; ++ptr) { @@ -87,6 +86,8 @@ strv_to_regex_array (const char **strv, g_ptr_array_add (regex_array, regex); } + g_ptr_array_add (regex_array, NULL); + *out_ptr_array = g_steal_pointer (®ex_array); return TRUE; } From 2dd0d1103bf6cfd8fb143c4cc81a3ec0575f4795 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Thu, 27 Jul 2023 03:51:33 +0300 Subject: [PATCH 18/22] ggml-quantize: Ignore NULL regexes when unreffing --- ggml-gobject/ggml-quantize.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml-gobject/ggml-quantize.c b/ggml-gobject/ggml-quantize.c index fa5f888..d28a0a0 100644 --- a/ggml-gobject/ggml-quantize.c +++ b/ggml-gobject/ggml-quantize.c @@ -59,6 +59,17 @@ matches_quantize_regexes (GRegex **quantize_keys, return should_quantize; } +static void +unref_regex_ignore_null (GRegex *regex) +{ + if (regex == NULL) + { + return; + } + + return g_regex_unref (regex); +} + static gboolean strv_to_regex_array (const char **strv, GPtrArray **out_ptr_array, @@ -71,7 +82,7 @@ strv_to_regex_array (const char **strv, } g_autoptr(GPtrArray) regex_array = g_ptr_array_new_full (g_strv_length ((GStrv) strv), - (GDestroyNotify) g_regex_unref); + (GDestroyNotify) unref_regex_ignore_null); for (const char **ptr = strv; *ptr != NULL; ++ptr) { From 60ceabe448534a5dd700c8b0f4e082e8d8ae08c0 Mon Sep 17 00:00:00 2001 From: Sam Spilsbury Date: Thu, 27 Jul 2023 04:11:44 +0300 Subject: [PATCH 19/22] testLoadGPT2: Allow for variations depending on system Quantization is imprecise, so we could get slightly different answers depending on the architecture. 
---
 tests/js/testLoadGPT2.js | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/js/testLoadGPT2.js b/tests/js/testLoadGPT2.js
index 4a66dbb..eb33552 100644
--- a/tests/js/testLoadGPT2.js
+++ b/tests/js/testLoadGPT2.js
@@ -558,8 +558,8 @@ describe('GGML GPT2', function() {
       null
     );
 
-    expect(language_model.complete('The meaning of life is:', 7, null)).toEqual(
-      ['The meaning of life is: to live in a state of being', false]
+    expect(language_model.complete('The meaning of life is:', 7, null)[0]).toMatch(
+      /The meaning of life is\: to live in a state of (being|peace)/,
     );
   });
 })

From bd91dad8f3f2f93a5a46b3af3fcb5e63f85951b7 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Thu, 27 Jul 2023 04:22:00 +0300
Subject: [PATCH 20/22] llm-writer-app: Adjust to change in
 load_defined_from_istream_async API

---
 examples/llm-writer-app/src/main.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llm-writer-app/src/main.js b/examples/llm-writer-app/src/main.js
index 4471866..d21e127 100644
--- a/examples/llm-writer-app/src/main.js
+++ b/examples/llm-writer-app/src/main.js
@@ -62,6 +62,7 @@ const load_model = (model, cancellable, callback, progress_callback) => {
   GGML.LanguageModel.load_defined_from_istream_async(
     model,
     istream,
+    null,
     cancellable,
     (src, res) => {
       try {

From 9db44171dd0ecfbd610af8a6b4c847b77985e6ab Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Thu, 27 Jul 2023 05:09:18 +0300
Subject: [PATCH 21/22] ggml-language-model: Support setting quantization
 flags also on async case
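
The post-processing step now shared by the sync and async paths is
roughly the following (a sketch for illustration, not the exact code;
it assumes the C config type is GGMLModelConfig, matching
GGML.ModelConfig in the JS bindings, and uses the two helpers this
patch calls):

    static GGMLModelDescNode *
    maybe_quantize_model_desc (GGMLModelDescNode  *model_desc,
                               GGMLModelConfig    *config,
                               GError            **error)
    {
      GGMLDataType quantized_type;
      const char **quantize_regexes = NULL;
      const char **skip_quantize_regexes = NULL;

      /* No quantization configured: pass the desc through unchanged. */
      if (!ggml_model_config_get_quantization_config (config,
                                                      &quantized_type,
                                                      &quantize_regexes,
                                                      &skip_quantize_regexes))
        return ggml_model_desc_node_ref (model_desc);

      /* May return NULL on error, e.g. when a regex fails to compile. */
      return ggml_configure_quantized_model_desc_by_regexes (model_desc,
                                                             quantized_type,
                                                             quantize_regexes,
                                                             skip_quantize_regexes,
                                                             error);
    }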
---
 ggml-gobject/ggml-language-model.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/ggml-gobject/ggml-language-model.c b/ggml-gobject/ggml-language-model.c
index 3348eb6..4f63efe 100644
--- a/ggml-gobject/ggml-language-model.c
+++ b/ggml-gobject/ggml-language-model.c
@@ -1177,8 +1177,33 @@ ggml_language_model_load_from_istream_on_hyperparameters_read (GObject *src,
 
   /* We can already use the hyperparameters to create the model desc. */
   data->hyperparameters = g_steal_pointer (&hyperparameters);
-  data->model_desc = (*data->create_model_desc) (data->hyperparameters,
-                                                 data->create_model_desc_user_data);
+  g_autoptr(GGMLModelDescNode) model_desc = (*data->create_model_desc) (data->hyperparameters,
+                                                                        data->create_model_desc_user_data);
+
+  GGMLDataType quantized_type;
+  const char **quantize_regexes = NULL;
+  const char **skip_quantize_regexes = NULL;
+  gboolean should_quantize = ggml_model_config_get_quantization_config (data->config,
+                                                                        &quantized_type,
+                                                                        &quantize_regexes,
+                                                                        &skip_quantize_regexes);
+
+  g_autoptr(GGMLModelDescNode) postprocessed_model_desc_node = (
+    should_quantize ?
+    ggml_configure_quantized_model_desc_by_regexes (model_desc,
+                                                    quantized_type,
+                                                    quantize_regexes,
+                                                    skip_quantize_regexes,
+                                                    &error) :
+    ggml_model_desc_node_ref (model_desc)
+  );
+
+  if (postprocessed_model_desc_node == NULL)
+    {
+      g_task_return_error (task, error);
+      return;
+    }
+
+  data->model_desc = g_steal_pointer (&postprocessed_model_desc_node);
 
   /* Continue reading the stream, now for the token dictionary */
   ggml_token_dictionary_load_from_istream_async (data->istream,

From 102980b911b9ca9b03a8f1a05ffa8cc95b4c35d8 Mon Sep 17 00:00:00 2001
From: Sam Spilsbury
Date: Thu, 27 Jul 2023 05:28:22 +0300
Subject: [PATCH 22/22] llm-writer-app: Allow configuring the quantization
 level
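
From C, the equivalent of what the app below does when a quantization
level is selected would look roughly like this (sketch; assumes a
ggml_model_config_new() constructor exists and that GGML_DATA_TYPE_Q8_0
is the generated C spelling of GGML.DataType.Q8_0, both hypothetical
spellings here):

    g_autoptr(GGMLModelConfig) config = ggml_model_config_new ();

    /* Quantize the usual GPT weight matrices to Q8_0, with no skip list. */
    ggml_model_config_set_quantization_config (config,
                                               GGML_DATA_TYPE_Q8_0,
                                               ggml_gpt_model_quantization_regexes (),
                                               NULL);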
---
 examples/llm-writer-app/src/main.js | 96 +++++++++++++++++++++--------
 1 file changed, 71 insertions(+), 25 deletions(-)

diff --git a/examples/llm-writer-app/src/main.js b/examples/llm-writer-app/src/main.js
index d21e127..8600c6c 100644
--- a/examples/llm-writer-app/src/main.js
+++ b/examples/llm-writer-app/src/main.js
@@ -52,17 +52,28 @@ const list_store_from_rows = (rows) => {
   return list_store;
 };
 
-const load_model = (model, cancellable, callback, progress_callback) => {
+const load_model = (model, quantization_level, cancellable, callback, progress_callback) => {
   const istream = GGML.LanguageModel.stream_from_cache(model);
 
   if (progress_callback) {
     istream.set_download_progress_callback(progress_callback);
   }
 
+  const config = GGML.ModelConfig.new();
+
+  if (quantization_level !== null)
+  {
+    config.set_quantization_config(
+      quantization_level,
+      GGML.gpt_model_quantization_regexes(),
+      null
+    );
+  }
+
   GGML.LanguageModel.load_defined_from_istream_async(
     model,
     istream,
-    null,
+    config,
     cancellable,
     (src, res) => {
       try {
@@ -78,10 +89,20 @@ const load_model = (model, cancellable, callback, progress_callback) => {
 };
 
 const COMBOBOX_ID_TO_LANGUAGE_MODEL_ENUM = Object.keys(GGML.DefinedLanguageModel).map(k => GGML.DefinedLanguageModel[k]);
+const COMBOBOX_ID_TO_QUANTIZATION_LEVEL_ENUM = [
+  null,
+  GGML.DataType.F16,
+  GGML.DataType.Q8_0,
+  GGML.DataType.Q5_0,
+  GGML.DataType.Q5_1,
+  GGML.DataType.Q4_0,
+  GGML.DataType.Q4_1,
+];
 
 class ModelLoader {
   constructor() {
     this._model_enum = null;
+    this._quantization_enum = null;
     this._model = null;
     this._pending_load = null;
   }
@@ -96,21 +117,24 @@ class ModelLoader {
    * if the action is cancelled, then @callback won't be invoked, but
    * the model will still be downloaded if the download is in progress.
    */
-  with_model(model_enum, cancellable, callback, progress_callback) {
-    if (this._model_enum === model_enum) {
+  with_model(model_enum, quantization_enum, cancellable, callback, progress_callback) {
+    if (this._model_enum === model_enum &&
+        this._quantization_enum === quantization_enum) {
       return callback(this._model)
     }
 
     if (this._pending_load) {
      /* We only do the most recent callback once the model is loaded
       * and discard other ones */
-      if (this._pending_load.model_enum !== model_enum) {
+      if (this._pending_load.model_enum !== model_enum ||
+          this._pending_load.quantization_enum !== quantization_enum) {
        /* Cancel the existing pending load and start over again */
         this._pending_load.load_cancellable.cancel();
       } else {
        /* Don't cancel the pending load operation, but change the callback */
         this._pending_load = {
           model_enum: model_enum,
+          quantization_enum: quantization_enum,
           callback: callback,
           load_cancellable: this._pending_load.load_cancellable,
           action_cancellable: cancellable
@@ -122,16 +146,18 @@ class ModelLoader {
    /* Create a pending load and load the model */
     this._pending_load = {
       model_enum: model_enum,
+      quantization_enum: quantization_enum,
       callback: callback,
       load_cancellable: new Gio.Cancellable(),
       action_cancellable: cancellable
     };
 
-    load_model(model_enum, this._pending_load.load_cancellable, model => {
+    load_model(model_enum, quantization_enum, this._pending_load.load_cancellable, model => {
       const { callback, action_cancellable } = this._pending_load;
 
       if (action_cancellable === null || !action_cancellable.is_cancelled()) {
         this._model_enum = model_enum;
+        this._quantization_enum = quantization_enum;
         this._model = model;
 
         System.gc();
@@ -141,6 +167,19 @@ class ModelLoader {
   }
 }
 
+const makeCombobox = (listOptions, callback) => {
+  const combobox = Gtk.ComboBox.new_with_model(
+    list_store_from_rows(listOptions)
+  );
+  const renderer = new Gtk.CellRendererText();
+  combobox.pack_start(renderer, true);
+  combobox.add_attribute(renderer, 'text', 0);
+  combobox.set_active(0);
+  combobox.connect('changed', callback);
+
+  return combobox;
+};
+
 const LLMWriterAppMainWindow = GObject.registerClass({
   Template: `${RESOURCE_PATH}/main.ui`,
   Children: [
@@ -179,30 +218,36 @@ const LLMWriterAppMainWindow = GObject.registerClass({
     this._spinner = new Gtk.Spinner({
       visible: true
     });
-    const combobox = Gtk.ComboBox.new_with_model(
-      list_store_from_rows([
-        ['GPT2 117M'],
-        ['GPT2 345M'],
-        ['GPT2 774M'],
-        ['GPT2 1558M'],
-      ])
-    );
-    const renderer = new Gtk.CellRendererText();
-    combobox.pack_start(renderer, true);
-    combobox.add_attribute(renderer, 'text', 0);
-    combobox.set_active(0);
-    combobox.connect('changed', () => {
+    const comboboxChangedCallback = () => {
       resetProgress();
       this._model_loader.with_model(
-        COMBOBOX_ID_TO_LANGUAGE_MODEL_ENUM[combobox.active],
+        COMBOBOX_ID_TO_LANGUAGE_MODEL_ENUM[modelCombobox.active],
+        COMBOBOX_ID_TO_QUANTIZATION_LEVEL_ENUM[quantizationCombobox.active],
         null,
         () => this._spinner.stop(),
         progressCallback
       );
-    });
-    combobox.show();
-
-    header.pack_start(combobox);
+    };
+    const modelCombobox = makeCombobox([
+      ['GPT2 117M'],
+      ['GPT2 345M'],
+      ['GPT2 774M'],
+      ['GPT2 1558M'],
+    ], comboboxChangedCallback);
+    modelCombobox.show();
+    const quantizationCombobox = makeCombobox([
+      ['No quantization'],
+      ['F16'],
+      ['Q8_0'],
+      ['Q5_0'],
+      ['Q5_1'],
+      ['Q4_0'],
+      ['Q4_1'],
+    ], comboboxChangedCallback);
+    quantizationCombobox.show();
+
+    header.pack_start(modelCombobox);
+    header.pack_start(quantizationCombobox);
     header.pack_end(this._spinner);
 
     this.set_titlebar(header);
 
@@ -263,7 +308,8 @@ const LLMWriterAppMainWindow = GObject.registerClass({
     buffer.create_mark("predictions-start", buffer.get_end_iter(), true);
 
     this._model_loader.with_model(
-      COMBOBOX_ID_TO_LANGUAGE_MODEL_ENUM[combobox.active],
+      COMBOBOX_ID_TO_LANGUAGE_MODEL_ENUM[modelCombobox.active],
+      COMBOBOX_ID_TO_QUANTIZATION_LEVEL_ENUM[quantizationCombobox.active],
       this._cancellable,
       model => {
         model.complete_async(