mmap hack
slaren committed Mar 28, 2023
1 parent 7e53955 commit fc68512
Showing 3 changed files with 58 additions and 20 deletions.
8 changes: 6 additions & 2 deletions ggml.c
@@ -2737,6 +2737,10 @@ bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
 #endif // GGML_MLOCK_SUPPORT

 ////////////////////////////////////////////////////////////////////////////////
+int g_nomem = 0;
+void ggml_nomem(int nomem) {
+    g_nomem = nomem;
+}

 struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
@@ -2753,7 +2757,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

     size_t size_needed = 0;

-    if (data == NULL) {
+    if (data == NULL && !g_nomem) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2837,7 +2841,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs =*/ 0,
         /*.perf_cycles =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data =*/ (data == NULL && !g_nomem) ? (void *)(result + 1) : data,
         /*.pad =*/ { 0 },
     };

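Net effect of the two guarded sites in ggml.c: while the flag is set, a tensor created with data == NULL gets only its ggml_tensor header placed in the context buffer, and ->data is left NULL for the caller to patch. A minimal sketch of the intended calling pattern (hedged: map_weight, mm, and file_offset are hypothetical; the ggml calls are as of this commit):

#include "ggml.h"

// Sketch: create a header-only tensor, then point it at caller-owned
// memory, e.g. a slice of an mmapped model file.
struct ggml_tensor * map_weight(struct ggml_context * ctx,
                                char * mm, size_t file_offset,
                                int ne0, int ne1) {
    ggml_nomem(1); // header only: no data bytes reserved in ctx
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1);
    ggml_nomem(0);

    t->data = mm + file_offset; // weights stay in the OS page cache
    return t;
}
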
2 changes: 2 additions & 0 deletions ggml.h
@@ -346,6 +346,8 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 bool ggml_mlock_supported(void);
 bool ggml_mlock(struct ggml_context * ctx, char ** err_p);

+void ggml_nomem(int nomem);
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
68 changes: 50 additions & 18 deletions llama.cpp
@@ -12,6 +12,10 @@
 #include <cassert>
 #include <cstring>

+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -452,36 +456,37 @@ static bool llama_model_load(
     auto & ctx = model.ctx;

     size_t ctx_size = 0;

     {
         const auto & hparams = model.hparams;

         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
         const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (n_parts > 1) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm

-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm

-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+        // this is no longer stored in this context
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v

         ctx_size += (5 + 10*n_layer)*256; // object overhead
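For scale (rough numbers, not from the commit): with the weights mmapped, the context no longer pays for any weight data, and the surviving object-overhead term is tiny:

// Sketch: remaining context charge when n_parts == 1, per the
// accounting above; n_layer = 32 is LLaMA-7B.
const int n_layer = 32;
size_t overhead = (5 + 10*n_layer)*256; // = 83200 bytes, about 81 KiB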

@@ -533,6 +538,9 @@ static bool llama_model_load(

         model.layers.resize(n_layer);

+        if (n_parts == 1)
+            ggml_nomem(1); // hack to stop ggml from allocating memory for these tensors
+
         model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

         model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -576,6 +584,9 @@ static bool llama_model_load(
             }
         }

+        if (n_parts == 1)
+            ggml_nomem(0);
+
         const size_t file_offset = fin.tellg();

         fin.close();
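The two n_parts == 1 guards bracket exactly the ggml_new_tensor_* calls that create the weight tensors, so every tensor in between is header-only. Since the flag is a global and an early return between the calls would leave it set, a scoped guard is one way to harden the pattern (hypothetical sketch, not part of the commit):

// Hypothetical RAII wrapper for the ggml_nomem() bracket above.
struct ggml_nomem_guard {
    ggml_nomem_guard()  { ggml_nomem(1); }
    ~ggml_nomem_guard() { ggml_nomem(0); } // restored on any exit path
};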
@@ -600,6 +611,17 @@ static bool llama_model_load(
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());

+        // mmap support
+        int fd = open(fname.c_str(), O_RDONLY);
+        size_t len = lseek(fd, 0, SEEK_END);
+        char* mm = (char*)mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+        if (mm == MAP_FAILED) {
+            perror("mmap");
+            mm = NULL;
+        }
+        close(fd);
+        //
+
         fin.seekg(0, fin.end);
         const size_t file_size = fin.tellg();
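For the single-part models this hack targets, the part loop runs once and the whole file is mapped read-only; closing fd right after mmap is safe because the mapping keeps its own reference to the file. A slightly more defensive variant of the same block (sketch only: the open/fstat error checks are not in the commit):

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

// Sketch: map a read-only view of the model file, with error checks.
static char * map_file(const char * path, size_t * len_out) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return NULL; }
    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return NULL; }
    char * mm = (char *) mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping outlives the descriptor
    if (mm == MAP_FAILED) { perror("mmap"); return NULL; }
    *len_out = (size_t) st.st_size;
    return mm;
}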

@@ -736,13 +758,23 @@ static bool llama_model_load(
             }

             if (part_id == 0) {
-                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                if (mm == NULL) {
+                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                }
+                else {
+                    fprintf(stderr, "tensor mmaped: %s\n", name.c_str());
+                    off_t offset = fin.tellg();
+                    tensor->data = mm + offset;
+                    fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                }
             } else {
                 fprintf(stderr, "tensor skipped: %s\n", name.c_str());
                 fin.seekg(ggml_nbytes(tensor), std::ios::cur);
             }

-            total_size += ggml_nbytes(tensor);
+            //total_size += ggml_nbytes(tensor);
         } else {
+            fprintf(stderr, "tensor not mmaped: %s\n", name.c_str());
             if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                         __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
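Why tensor->data = mm + offset works: the old path read ggml_nbytes(tensor) raw bytes straight into tensor->data, so the on-disk payload already has exactly the in-memory layout, and pointing data at those same bytes in place is equivalent, minus the copy. The mapping is PROT_READ, so anything that tried to write the weights in place would now fault. A hedged restatement of the aliasing step with the bounds check the hack omits (len is the mapping length from above; <cassert> is already included):

// Sketch: alias the tensor onto the mapped file instead of reading it.
off_t offset = fin.tellg();                    // payload starts here
assert(offset >= 0 && (size_t) offset + ggml_nbytes(tensor) <= len);
tensor->data = mm + offset;                    // zero-copy view
fin.seekg(ggml_nbytes(tensor), std::ios::cur); // keep the stream in sync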
