talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI
* talk-llama : disable EOS token
* talk-llama : add README instructions
* ggml : fix build in debug
2023-11-04 02:52:44 +03:00 · 2023-03-27 21:00:32 +03:00
parent 8e361d90d7
commit 4a0deb8b1e
14 changed files with 5061 additions and 528 deletions
--- a/ggml.h
+++ b/ggml.h
@@ -198,6 +198,8 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@@ -226,7 +228,9 @@ enum ggml_op {
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
+    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
+    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

@@ -326,7 +330,10 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);

-size_t ggml_type_size   (enum ggml_type type);
+int    ggml_blck_size (enum ggml_type type); // NOTE(review): presumably the number of elements per block (the divisor used by ggml_type_sizef) - confirm
+size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -336,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

+bool ggml_mlock_supported(void); // NOTE(review): presumably reports whether memory locking is available on this platform - confirm in ggml.c
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p); // NOTE(review): presumably locks ctx's memory in RAM; the meaning of the bool result and the ownership of *err_p are not visible in this header - confirm
+
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@@ -466,12 +476,20 @@ struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

+struct ggml_tensor * ggml_silu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a); // NOTE(review): presumably builds a GGML_OP_SILU node over a, mirroring ggml_gelu - confirm in ggml.c
+
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

+struct ggml_tensor * ggml_rms_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a); // NOTE(review): presumably RMS normalization along rows, analogous to ggml_norm; eps handling is not visible here - confirm in ggml.c
+
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
@@ -726,6 +744,13 @@ enum ggml_opt_result ggml_opt(
        struct ggml_opt_params params,
        struct ggml_tensor * f);

+//
+// quantization
+//
+
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist); // takes float input in src and an output buffer dst; NOTE(review): meaning of n/k/qk, hist and the returned size_t is not visible in this header - confirm in ggml.c
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist); // same signature as ggml_quantize_q4_0; NOTE(review): presumably the GGML_TYPE_Q4_1 counterpart - confirm in ggml.c
+
 //
 // system info
 //