tests: add gradient tests for all backends #932

Merged: 5 commits, Sep 3, 2024
12 changes: 6 additions & 6 deletions include/ggml.h
@@ -1234,7 +1234,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1244,35 +1244,35 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes

GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes

GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes

// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
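The comment change above only documents an existing convention: the offset argument of the ggml_set_* family is a byte offset into a, not an element index. A hedged usage sketch, not part of the diff (ctx, a and b are assumed to be an existing context and two F32 tensors):

    // offset is in bytes: to start writing b at element 10 of an F32 tensor a,
    // pass 10*sizeof(float), i.e. 40, rather than 10.
    struct ggml_tensor * r = ggml_set_1d(ctx, a, b, 10*sizeof(float));
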
4 changes: 4 additions & 0 deletions src/ggml-backend.c
@@ -825,6 +825,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}
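The two new cases let the CPU backend report exactly which of the backward ops it can run: GGML_OP_ROPE_BACK only without frequency factors (src[2] == NULL) and without the ChatGLM rope mode (mode bit 4), and GGML_OP_IM2COL_BACK only for F32 inputs. A hedged sketch, not part of the diff, of how a gradient test can use the public ggml_backend_supports_op() to skip graphs a backend cannot run; the helper name cgraph_fully_supported is illustrative:

    // Returns true if every node of the graph is supported by the backend.
    static bool cgraph_fully_supported(ggml_backend_t backend, const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; ++i) {
            if (!ggml_backend_supports_op(backend, gf->nodes[i])) {
                return false; // e.g. CPU rejects ROPE_BACK when freq. factors are set (src[2] != NULL)
            }
        }
        return true;
    }
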
12 changes: 12 additions & 0 deletions src/ggml-cuda.cu
@@ -27,6 +27,7 @@
#include "ggml-cuda/rope.cuh"
#include "ggml-cuda/scale.cuh"
#include "ggml-cuda/softmax.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
@@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
@@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG:
ggml_cuda_op_neg(ctx, dst);
break;
case GGML_UNARY_OP_GELU:
ggml_cuda_op_gelu(ctx, dst);
break;
@@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cuda_op_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
@@ -2741,6 +2749,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
@@ -2867,6 +2876,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
@@ -2886,7 +2896,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
4 changes: 2 additions & 2 deletions src/ggml-cuda/cross-entropy-loss.cu
@@ -1,6 +1,6 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include "sum.cuh"

#include <cmath>
#include <cstdint>
@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}
41 changes: 41 additions & 0 deletions src/ggml-cuda/sum.cu
@@ -0,0 +1,41 @@
#include "sumrows.cuh"
#include "sum.cuh"

#include <cstdint>

#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
// Use (inefficient) sum_rows implementation as a fallback.
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));

const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;

const int64_t ne = ggml_nelements(src0);

ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();

sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
}
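sum_f32_cuda follows CUB's standard two-call convention: the first DeviceReduce::Sum call with a null workspace pointer only writes the required scratch size into tmp_size, and the second call performs the reduction on the given stream, with the scratch buffer taken from ggml's CUDA memory pool. A self-contained sketch of the same pattern outside ggml, for illustration only (it uses cudaMallocAsync where the code above uses ggml_cuda_pool_alloc):

    #include <cub/cub.cuh>

    // Sum n device floats into *d_out with cub::DeviceReduce::Sum.
    static void device_sum(const float * d_in, float * d_out, int n, cudaStream_t stream) {
        size_t tmp_bytes = 0;
        // Call 1: null workspace, CUB only reports how much scratch memory it needs.
        cub::DeviceReduce::Sum(nullptr, tmp_bytes, d_in, d_out, n, stream);
        void * d_tmp = nullptr;
        cudaMallocAsync(&d_tmp, tmp_bytes, stream);
        // Call 2: the actual reduction, enqueued on the same stream.
        cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_in, d_out, n, stream);
        cudaFreeAsync(d_tmp, stream);
    }
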
5 changes: 5 additions & 0 deletions src/ggml-cuda/sum.cuh
@@ -0,0 +1,5 @@
#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 changes: 29 additions & 0 deletions src/ggml-cuda/unary.cu
@@ -1,5 +1,15 @@
#include "unary.cuh"

static __global__ void neg_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}

dst[i] = -x[i];
}

static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
dst[i] = cosf(x[i]);
}

static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();

GGML_ASSERT(ggml_is_contiguous(src0));

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
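The new NEG kernel follows the same one-thread-per-element template as the other unary ops in this file: a fixed block size, a ceil-divided grid, and an early return for threads past the end of the tensor. The launch arithmetic, worked for an illustrative element count (not part of the diff):

    // Grid sizing used by neg_f32_cuda, with k = 1000 elements as an example:
    const int num_blocks = (1000 + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; // = 4
    // 4 blocks * 256 threads = 1024 threads; threads with i >= 1000 hit the
    // early return in neg_f32, so no out-of-bounds writes occur.
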
3 changes: 3 additions & 0 deletions src/ggml-cuda/unary.cuh
@@ -1,5 +1,6 @@
#include "common.cuh"

#define CUDA_NEG_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
@@ -12,6 +13,8 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
32 changes: 16 additions & 16 deletions src/ggml.c
@@ -5131,6 +5131,7 @@ struct ggml_tensor * ggml_concat(
bool is_node = false;

if (a->grad || b->grad) {
GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}

@@ -5252,6 +5253,7 @@ struct ggml_tensor * ggml_leaky_relu(
bool is_node = false;

if (!inplace && (a->grad)) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}

@@ -5677,6 +5679,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

GGML_ASSERT(offset < (size_t)(1 << 30));
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));

@@ -6634,14 +6637,12 @@ struct ggml_tensor * ggml_rope_back(
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT(c == NULL && "freq factors not implemented yet");

GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

bool is_node = false;

if (a->grad) {
is_node = false; // TODO: implement backward
GGML_ASSERT(false && "backwards pass not implemented");
is_node = false;
}

struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
@@ -6659,6 +6660,7 @@ struct ggml_tensor * ggml_rope_back(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;

return result;
}
@@ -7212,6 +7214,11 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;

if (a->grad) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}

struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -10745,9 +10752,6 @@ static void ggml_compute_forward_sum_f32(
return;
}

assert(ggml_is_scalar(dst));


assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));

@@ -18000,14 +18004,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);

tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
nb1, nb2, nb3, offset);
}

@@ -18076,9 +18076,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor

memcpy(&offset, tensor->op_params, sizeof(offset));

size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];

if (src0->type != src0->grad->type) {
// gradient is typically F32, but src0 could be other type