Skip to content

Commit

Permalink
tests: add gradient checking to test-backend-ops
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGaessler committed Aug 31, 2024
1 parent d3a58b0 commit 4e15cdf
Show file tree
Hide file tree
Showing 9 changed files with 988 additions and 84 deletions.
12 changes: 6 additions & 6 deletions include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
Expand All @@ -1244,35 +1244,35 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes

GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes

GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes

// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes

// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
Expand Down
12 changes: 12 additions & 0 deletions src/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "ggml-cuda/rope.cuh"
#include "ggml-cuda/scale.cuh"
#include "ggml-cuda/softmax.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
Expand Down Expand Up @@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
Expand All @@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG:
ggml_cuda_op_neg(ctx, dst);
break;
case GGML_UNARY_OP_GELU:
ggml_cuda_op_gelu(ctx, dst);
break;
Expand Down Expand Up @@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cuda_op_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
Expand Down Expand Up @@ -2741,6 +2749,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
Expand Down Expand Up @@ -2867,6 +2876,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
Expand All @@ -2886,7 +2896,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
Expand Down
4 changes: 2 additions & 2 deletions src/ggml-cuda/cross-entropy-loss.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include "sum.cuh"

#include <cmath>
#include <cstdint>
Expand Down Expand Up @@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}
41 changes: 41 additions & 0 deletions src/ggml-cuda/sum.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "sumrows.cuh"
#include "sum.cuh"

#include <cstdint>

#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
// Use (inefficient) sum_rows implementation as a fallback.
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));

const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;

const int64_t ne = ggml_nelements(src0);

ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();

sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
}
5 changes: 5 additions & 0 deletions src/ggml-cuda/sum.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 changes: 29 additions & 0 deletions src/ggml-cuda/unary.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#include "unary.cuh"

static __global__ void neg_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}

dst[i] = -x[i];
}

static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
Expand Down Expand Up @@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
dst[i] = cosf(x[i]);
}

static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
Expand Down Expand Up @@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();

GGML_ASSERT(ggml_is_contiguous(src0));

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
Expand Down
3 changes: 3 additions & 0 deletions src/ggml-cuda/unary.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common.cuh"

#define CUDA_NEG_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
Expand All @@ -12,6 +13,8 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
Expand Down
25 changes: 13 additions & 12 deletions src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -5131,6 +5131,7 @@ struct ggml_tensor * ggml_concat(
bool is_node = false;

if (a->grad || b->grad) {
GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}

Expand Down Expand Up @@ -5252,6 +5253,7 @@ struct ggml_tensor * ggml_leaky_relu(
bool is_node = false;

if (!inplace && (a->grad)) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}

Expand Down Expand Up @@ -5677,6 +5679,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

GGML_ASSERT(offset < (size_t)(1 << 30));
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));

Expand Down Expand Up @@ -7212,6 +7215,11 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;

if (a->grad) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}

struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

ggml_set_op_params_i32(result, 0, (int32_t) order);
Expand Down Expand Up @@ -10745,9 +10753,6 @@ static void ggml_compute_forward_sum_f32(
return;
}

assert(ggml_is_scalar(dst));


assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));

Expand Down Expand Up @@ -18000,14 +18005,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);

tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
nb1, nb2, nb3, offset);
}

Expand Down Expand Up @@ -18076,9 +18077,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor

memcpy(&offset, tensor->op_params, sizeof(offset));

size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];

if (src0->type != src0->grad->type) {
// gradient is typically F32, but src0 could be other type
Expand Down
Loading

0 comments on commit 4e15cdf

Please sign in to comment.