mirror of https://github.com/ollama/ollama

Merge d7cda4f5ac into a013693f80

commit 6d43c4a7ba
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=17f7f4baad8b3a716ee139da7bb56ae984e8c0fa
FETCH_HEAD=ec98e2002

.PHONY: help
help:
@@ -140,10 +140,6 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
		c.config.CachePadding = 1
	}

	if c.config.MaskBatchPadding == 0 {
		c.config.MaskBatchPadding = 1
	}

	if c.config.MaskDType == ml.DTypeOther {
		c.config.MaskDType = ml.DTypeF32
	}
@@ -364,15 +360,12 @@ func roundUp(length, pad int) int {
// token in the history should apply. This is based on both the sequence and causality (the
// position of the history is not ahead of the token in the batch).
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
	// Align and pad the two dimensions as required by the backend
	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)

	c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
	c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1

	length := c.curCellRange.max - c.curCellRange.min + 1

	mask := make([]float32, batchSize*length)
	mask := make([]float32, c.curBatchSize*length)

	for i := range c.curBatchSize {
		enabled := !slices.Contains(c.opts.Except, i)
@@ -386,13 +379,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
		}
	}

	// Mask out any padding tokens we added. For padding that we added to the cache history, this
	// has already been masked out because the sequence doesn't match.
	for i := c.curBatchSize * length; i < len(mask); i++ {
		mask[i] = float32(math.Inf(-1))
	}

	maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
	maskTensor := ctx.Input().FromFloats(mask, length, c.curBatchSize)

	if c.config.MaskDType != ml.DTypeF32 {
		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
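The two hunks above drop the batch-dimension padding of the attention mask: the buffer is now sized exactly curBatchSize×length, so there are no trailing rows to fill with -inf. A minimal sketch of the before/after arithmetic (in C++ for consistency with the rest of this diff; names are hypothetical, not the ollama code):

```cpp
#include <cmath>
#include <vector>

// Hypothetical illustration of the sizing change, not the actual ollama code.
static int round_up(int length, int pad) { return ((length + pad - 1) / pad) * pad; }

int main() {
    const int cur_batch_size = 3, length = 40, mask_batch_padding = 32;

    // before: batch dimension padded, trailing rows masked out with -inf
    const int padded = round_up(cur_batch_size, mask_batch_padding); // 32
    std::vector<float> mask_old(padded * length, 0.0f);
    for (int i = cur_batch_size * length; i < (int) mask_old.size(); i++) {
        mask_old[i] = -INFINITY;
    }

    // after: exact size, nothing extra to mask
    std::vector<float> mask_new(cur_batch_size * length, 0.0f);
    return 0;
}
```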
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "17f7f4baad8b3a716ee139da7bb56ae984e8c0fa";
char const *LLAMA_COMMIT = "ec98e2002";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
@@ -17,6 +17,9 @@ include /tools/mtmd/clip.cpp
include /tools/mtmd/mtmd.cpp
include /tools/mtmd/mtmd-audio.cpp
include /tools/mtmd/mtmd-helper.cpp
include /tools/mtmd/models/
include /tools/mtmd/models/*.h
include /tools/mtmd/models/*.cpp
include /src/
include /src/llama.*
include /src/llama-*.*
@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
// Model utils
//

static inline void common_init_sampler_from_model(
// TODO: move to common/sampling
static void common_init_sampler_from_model(
        const llama_model * model,
        common_params_sampling & sparams) {

    const uint64_t config = sparams.user_sampling_config;

    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
        if (config & user_config) return;
        if (config & user_config) {
            return;
        }

        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
            if (end && end != buf) dst = v;
            if (end && end != buf) {
                dst = v;
            }
        }
    };

    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
        if (config & user_config) return;
        if (config & user_config) {
            return;
        }

        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
            if (end && end != buf) dst = v;
            if (end && end != buf) {
                dst = v;
            }
        }
    };
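These helpers let a GGUF model ship its own default sampling parameters while user-supplied flags keep priority (via the user_sampling_config bitmask). A small self-contained sketch of the same end-pointer validation idiom the lambdas above rely on:

```cpp
#include <cstdio>
#include <cstdlib>

// strtol/strtof signal "nothing parsed" by leaving end == input,
// which is why the lambdas only assign dst when end != buf.
int main() {
    const char * good = "42";
    const char * bad  = "not-a-number";

    char * end = nullptr;
    long v = strtol(good, &end, 10);
    if (end && end != good) {
        printf("parsed %ld\n", v); // parsed 42
    }

    end = nullptr;
    v = strtol(bad, &end, 10);
    if (!(end && end != bad)) {
        printf("kept the existing default\n");
    }
    return 0;
}
```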
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}

struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

    llama_model_ptr   model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;

    std::vector<common_sampler_ptr> samplers;
};

common_init_result::common_init_result(common_params & params) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
                params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
                __func__, params.model.path.c_str());
        return iparams;
        return;
    }

    common_init_sampler_from_model(model, params.sampling);
    pimpl->model.reset(model);

    const llama_vocab * vocab = llama_model_get_vocab(model);

    auto cparams = common_context_params_to_llama(params);
    // updates params.sampling
    // TODO: fix naming
    common_init_sampler_from_model(model, params.sampling);

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

    if (params.sampling.ignore_eos) {
        // add EOG biases to the active set of logit biases
        params.sampling.logit_bias.insert(
                params.sampling.logit_bias.end(),
                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
    }

    //if (params.sampling.penalty_last_n == -1) {
    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

    pimpl->samplers.resize(cparams.n_seq_max);

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
                __func__, params.model.path.c_str());
        llama_model_free(model);
        return iparams;
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return;
    }

    pimpl->context.reset(lctx);
}

llama_model * common_init_result::model() {
    return pimpl->model.get();
}

llama_context * common_init_result::context() {
    return pimpl->context.get();
}

common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
}

std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
}

void common_init_result::free_context() {
    pimpl->context.reset();
}

common_init_result_ptr common_init_from_params(common_params & params) {
    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            llama_free(lctx);
            llama_model_free(model);

            return iparams;
            return res;
        }

        int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
            llama_free(lctx);
            llama_model_free(model);

            return iparams;
            return res;
        }
    }
@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }

        if (!ok) {
            llama_free(lctx);
            llama_model_free(model);

            return iparams;
            return res;
        }
    }
@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
            llama_model_free(model);
            return iparams;
            return res;
        }

        char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

    if (params.sampling.ignore_eos) {
        // add EOG biases to the active set of logit biases
        params.sampling.logit_bias.insert(
                params.sampling.logit_bias.end(),
                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
    }

    if (params.sampling.penalty_last_n == -1) {
        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
        params.sampling.penalty_last_n = llama_n_ctx(lctx);
    }

    if (params.sampling.dry_penalty_last_n == -1) {
        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    }

    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
        llama_set_warmup(lctx, false);
    }

    iparams.model.reset(model);
    iparams.context.reset(lctx);

    return iparams;
    return res;
}

common_init_result::~common_init_result() = default;

std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env) {
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') model_endpoint += '/';
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT,
};
@@ -194,7 +196,6 @@ struct common_params_sampling {

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    bool has_logit_bias() const {
        return !logit_bias.empty();
    }

    // print the parameters into a string
    std::string print() const;
};
@@ -302,8 +307,8 @@ struct lr_opt {
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

struct common_params {
    int32_t n_predict = -1;   // new tokens to predict
    int32_t n_ctx     = 4096; // context size
    int32_t n_predict = -1;   // max. number of new tokens to predict, -1 == no limit
    int32_t n_ctx     = 0;    // context size, 0 == context the model was trained with
    int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch  = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep    = 0;    // number of tokens to keep from initial prompt
@@ -324,9 +329,12 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool    fit_params = true; // whether to fit unset model/context parameters to free device memory
    size_t  fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -406,6 +414,7 @@ struct common_params {
    bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true;  // insert new sequences for decoding on-the-fly
    bool no_perf       = false; // disable performance metrics
    bool show_timings  = true;  // show timing information on CLI
    bool ctx_shift     = false; // context shift on infinite text generation
    bool swa_full      = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified    = false; // enable unified KV cache
@@ -462,7 +471,7 @@ struct common_params {
    std::string public_path   = ""; // NOLINT
    std::string api_prefix    = ""; // NOLINT
    std::string chat_template = ""; // NOLINT
    bool use_jinja = false; // NOLINT
    bool use_jinja = true;  // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
@@ -482,9 +491,10 @@ struct common_params {
    bool endpoint_metrics = false;

    // router server configs
    std::string models_dir = "";    // directory containing models for the router server
    int models_max = 4;             // maximum number of models to load simultaneously
    bool models_autoload = true;    // automatically load models when requested via the router server
    std::string models_dir    = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
    int models_max = 4;             // maximum number of models to load simultaneously
    bool models_autoload = true;    // automatically load models when requested via the router server

    bool log_json = false;
@@ -666,15 +676,29 @@ bool tty_can_use_colors();
// Model utils
//

// note: defines object's lifetime
struct common_init_result {
    llama_model_ptr   model;
    llama_context_ptr context;
struct common_sampler;

    std::vector<llama_adapter_lora_ptr> lora;
// note: defines the model, context, samplers, etc. lifetimes
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    llama_model *    model();
    llama_context *  context();
    common_sampler * sampler(llama_seq_id seq_id);

    std::vector<llama_adapter_lora_ptr> & lora();

    void free_context();

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

struct common_init_result common_init_from_params(common_params & params);
using common_init_result_ptr = std::unique_ptr<common_init_result>;

common_init_result_ptr common_init_from_params(common_params & params);

struct llama_model_params   common_model_params_to_llama  (      common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
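The header above replaces the aggregate common_init_result with a pimpl class whose destructor owns the model, context, LoRA adapters, and per-sequence samplers. A hedged sketch of how a caller might use the new API (names taken only from the declarations above):

```cpp
// Sketch only: assumes the declarations from the common.h hunk above.
#include "common.h"

void run(common_params & params) {
    common_init_result_ptr init = common_init_from_params(params);
    if (init->model() == nullptr || init->context() == nullptr) {
        return; // model load or context creation failed
    }

    llama_context * ctx  = init->context();
    common_sampler * smpl = init->sampler(0); // sampler for sequence 0

    // ... decode and sample using ctx and smpl ...

    // context, samplers, and model are freed together when init goes out of
    // scope, because the impl members own the resources
    (void) ctx; (void) smpl;
}
```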
@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {

std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }

class SchemaConverter {
class common_schema_converter {
private:
    friend class common_schema_info;
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
@@ -729,7 +730,7 @@ private:
    }

public:
    SchemaConverter(
    common_schema_converter(
        const std::function<json(const std::string &)> & fetch_json,
        bool dotall)
        : _fetch_json(fetch_json), _dotall(dotall)
@@ -990,6 +991,134 @@ public:
    }
};

// common_schema_info implementation (pimpl)

common_schema_info::common_schema_info()
    : impl_(std::make_unique<common_schema_converter>(
        [](const std::string &) { return json(); },
        false)) {}

common_schema_info::~common_schema_info() = default;

common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;

void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
    impl_->resolve_refs(schema, "");
}

// Determines if a JSON schema can resolve to a string type through any path.
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
// true, allowing callers to handle the value as a raw string for simplicity.
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
    std::unordered_set<std::string> visited_refs;

    std::function<bool(const json &)> check = [&](const json & s) -> bool {
        if (!s.is_object()) {
            return false;
        }

        // Handle $ref
        if (s.contains("$ref")) {
            const std::string & ref = s["$ref"];
            if (visited_refs.find(ref) != visited_refs.end()) {
                // Circular reference, assume not a string to be safe
                return false;
            }
            visited_refs.insert(ref);
            auto it = impl_->_refs.find(ref);
            if (it != impl_->_refs.end()) {
                return check(it->second);
            }
            return false;
        }

        // Check type field
        if (s.contains("type")) {
            const json & schema_type = s["type"];
            if (schema_type.is_string()) {
                if (schema_type == "string") {
                    return true;
                }
            } else if (schema_type.is_array()) {
                // Type can be an array like ["string", "null"]
                for (const auto & t : schema_type) {
                    if (t == "string") {
                        return true;
                    }
                }
            }
        }

        // Check oneOf/anyOf - if any alternative can be a string
        if (s.contains("oneOf")) {
            for (const auto & alt : s["oneOf"]) {
                if (check(alt)) {
                    return true;
                }
            }
        }
        if (s.contains("anyOf")) {
            for (const auto & alt : s["anyOf"]) {
                if (check(alt)) {
                    return true;
                }
            }
        }

        // Check allOf - all components must be compatible with string type
        if (s.contains("allOf")) {
            bool all_string = true;
            for (const auto & component : s["allOf"]) {
                if (!check(component)) {
                    all_string = false;
                    break;
                }
            }
            if (all_string) {
                return true;
            }
        }

        // Check const - if the constant value is a string
        if (s.contains("const")) {
            if (s["const"].is_string()) {
                return true;
            }
        }

        // Check enum - if any enum value is a string
        if (s.contains("enum")) {
            for (const auto & val : s["enum"]) {
                if (val.is_string()) {
                    return true;
                }
            }
        }

        // String-specific keywords imply string type
        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
            return true;
        }

        // Check format - many formats imply string
        if (s.contains("format")) {
            const std::string & fmt = s["format"];
            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
                fmt.find("uuid") == 0) {
                return true;
            }
        }

        return false;
    };

    return check(schema);
}

std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
    if (!force_gbnf) {
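A hedged usage sketch of the new probe, assuming the common_schema_info declaration from the header hunk below; the schema itself is illustrative:

```cpp
#include <nlohmann/json.hpp>
// assumes json-schema-to-grammar.h as declared in this diff

bool tool_arg_may_be_raw_string() {
    // a parameter that is either a number or a string
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"({
        "anyOf": [
            { "type": "number" },
            { "type": "string" }
        ]
    })");

    common_schema_info info;
    // true: the anyOf branch with "type": "string" permits a raw string value
    return info.resolves_to_string(schema);
}
```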
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}

std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
    common_grammar_builder builder {
        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
            return converter._add_rule(name, rule);
@@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp>

#include <functional>
#include <memory>
#include <string>

std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
    bool force_gbnf = false);

class common_schema_converter;

// Probes a JSON schema to extract information about its structure and type constraints.
class common_schema_info {
    std::unique_ptr<common_schema_converter> impl_;

public:
    common_schema_info();
    ~common_schema_info();

    common_schema_info(const common_schema_info &) = delete;
    common_schema_info & operator=(const common_schema_info &) = delete;
    common_schema_info(common_schema_info &&) noexcept;
    common_schema_info & operator=(common_schema_info &&) noexcept;

    void resolve_refs(nlohmann::ordered_json & schema);
    bool resolves_to_string(const nlohmann::ordered_json & schema);
};

struct common_grammar_builder {
    std::function<std::string(const std::string &, const std::string &)> add_rule;
    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}

void common_log_flush(struct common_log * log) {
    log->pause();
    log->resume();
}

static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
@@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n
void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
void common_log_flush         (struct common_log * log);                    // flush all pending log messages

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
    common_params_sampling params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;

    bool grammar;

    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
    void reset() {
        prev.clear();

        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
    }
@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

    struct llama_sampler * grmr;
    llama_sampler * chain = llama_sampler_chain_init(lparams);

    bool grammar = false;
    std::vector<llama_sampler *> samplers;

    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
        grammar = true;
#else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            trigger_patterns_c.push_back(regex.c_str());
        }

        grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                   trigger_patterns_c.data(), trigger_patterns_c.size(),
                   trigger_tokens.data(), trigger_tokens.size())
             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
        if (!grmr) {
            return nullptr;
        if (!params.grammar.empty()) {
            if (params.grammar_lazy) {
                samplers.push_back(
                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                        trigger_patterns_c.data(), trigger_patterns_c.size(),
                        trigger_tokens.data(), trigger_tokens.size()));
            } else {
                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
            }

            grammar = true;
        }
    }

    auto * result = new common_sampler {
        /* .params = */ params,
        /* .grmr   = */ grmr,
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
        /* .cur_p  = */ {},
    };

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_logit_bias(
                llama_vocab_n_tokens(vocab),
                params.logit_bias.size(),
                params.logit_bias.data()));
    if (params.has_logit_bias()) {
        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
    }

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                        c_breakers.push_back(str.c_str());
                    }

                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                }
                break;
            case COMMON_SAMPLER_TYPE_TOP_K:
                llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
                samplers.push_back(llama_sampler_init_top_k (params.top_k));
                break;
            case COMMON_SAMPLER_TYPE_TOP_P:
                llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
                samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
                llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                break;
            case COMMON_SAMPLER_TYPE_MIN_P:
                llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
                samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_XTC:
                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                break;
            case COMMON_SAMPLER_TYPE_TYPICAL_P:
                llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
                samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TEMPERATURE:
                llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                break;
            case COMMON_SAMPLER_TYPE_INFILL:
                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
                samplers.push_back(llama_sampler_init_infill (vocab));
                break;
            case COMMON_SAMPLER_TYPE_PENALTIES:
                llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                break;
            default:
                GGML_ASSERT(false && "unknown sampler type");
        }
    }
    llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));

    samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
        samplers.push_back(llama_sampler_init_temp(params.temp));
        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
        samplers.push_back(llama_sampler_init_temp(params.temp));
        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

    for (auto * smpl : samplers) {
        llama_sampler_chain_add(chain, smpl);
    }

    auto * result = new common_sampler {
        /* .params  = */ params,
        /* .chain   = */ chain,
        /* .grammar = */ grammar,
        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur     = */ {},
        /* .cur_p   = */ {},
    };

    return result;
}

void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
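The refactor above collects raw sampler pointers into a vector and adds them to a single chain in one pass, instead of interleaving chain_add calls with construction; the standalone grammar sampler (grmr) is folded into the chain as its first element. A hedged standalone sketch of the same pattern using public llama.cpp sampler APIs:

```cpp
#include "llama.h"
#include <vector>

// Sketch: build a sampler chain from a vector, mirroring the pattern above.
llama_sampler * make_chain() {
    auto lparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(lparams);

    std::vector<llama_sampler *> samplers;
    samplers.push_back(llama_sampler_init_top_k(40));
    samplers.push_back(llama_sampler_init_top_p(0.95f, 1));
    samplers.push_back(llama_sampler_init_temp(0.8f));
    samplers.push_back(llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // order matters: the chain applies samplers in insertion order
    for (auto * smpl : samplers) {
        llama_sampler_chain_add(chain, smpl); // the chain takes ownership
    }
    return chain; // free with llama_sampler_free
}
```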
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
    if (gsmpl->grammar) {
        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

    llama_sampler_accept(gsmpl->chain, token);
        for (int i = 0; i < n_smpl; i++) {
            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);

            // the grammar sampler is always the first one
            if (i == 0) {
                if (accept_grammar) {
                    llama_sampler_accept(smpl, token);
                }
            } else {
                llama_sampler_accept(smpl, token);
            }
        }
    } else {
        llama_sampler_accept(gsmpl->chain, token);
    }

    gsmpl->prev.push_back(token);
}
@@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
        /* .prev   = */ gsmpl->prev,
        /* .cur    = */ gsmpl->cur,
        /* .cur_p  = */ gsmpl->cur_p,
        /* .params  = */ gsmpl->params,
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
    };
}
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
    return gsmpl->chain;
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
    const auto tm = gsmpl->tm();

    gsmpl->set_logits(ctx, idx);
    llama_token id = LLAMA_TOKEN_NULL;

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

    const llama_token id = cur_p.data[cur_p.selected].id;
    id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

    // check if the sampled token fits the grammar
    {
        llama_token_data single_token_data = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr, &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
    return id;
}

std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;

@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
}

std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
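With grammar_first gone, sampling is a single application of the chain (grammar included, when present), so a caller just samples and accepts. A hedged sketch of a decode loop against the simplified API (helper names come from this diff; batch handling is elided):

```cpp
// Sketch of a decode loop using the simplified API from this diff.
// Assumes ctx, smpl, and batch were set up as in the hunks above.
void decode_loop(llama_context * ctx, common_sampler * smpl, llama_batch & batch, int n_predict) {
    for (int n = 0; n < n_predict; n++) {
        if (llama_decode(ctx, batch) != 0) {
            break;
        }

        // sample from the logits of the last token in the batch
        const llama_token id = common_sampler_sample(smpl, ctx, -1);

        // advance every sampler in the chain, including the grammar if present
        common_sampler_accept(smpl, id, /*accept_grammar=*/true);

        // ... append id to the output and rebuild batch with id ...
    }
}
```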
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
        result += std::string("-> ");
        result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

// extended sampling implementation:
//
// - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

// generalized version of common_sampler_sample
//
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
    const char * grammar_kind, const char * grammar_data);

struct common_sampler_deleter {
    void operator()(common_sampler * s) { common_sampler_free(s); }
};

typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
@@ -313,6 +313,7 @@ extern "C" {
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
        bool no_alloc;        // only load metadata and simulate memory allocations
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -466,10 +467,24 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
    // returns true if the parameters could be successfully modified to fit device memory
    // this function is NOT thread safe because it modifies the global llama logger state
    LLAMA_API bool llama_params_fit(
            const char * path_model,
            struct llama_model_params * mparams,
            struct llama_context_params * cparams,
            float * tensor_split,           // writable buffer for tensor split, needs at least llama_max_devices elements
            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
            size_t margin,                  // margin of memory to leave per device in bytes
            uint32_t n_ctx_min,             // minimum context size to set when trying to reduce memory use
            enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log

    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_max_parallel_sequences(void);
    LLAMA_API size_t llama_max_tensor_buft_overrides(void);

    LLAMA_API bool llama_supports_mmap  (void);
    LLAMA_API bool llama_supports_mlock (void);
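A hedged sketch of calling the new llama_params_fit before loading, using only declarations from this hunk plus stock llama.h entry points; the buffer sizes follow the comments above:

```cpp
// Sketch: shrink model/context params to fit free device memory before loading.
#include "llama.h"
#include <vector>

bool load_fitted(const char * path) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t margin = 1024u * 1024u * 1024u; // leave ~1 GiB free per device

    if (!llama_params_fit(path, &mparams, &cparams,
                          tensor_split.data(), overrides.data(),
                          margin, /*n_ctx_min=*/4096, GGML_LOG_LEVEL_ERROR)) {
        return false; // could not fit - fall back to defaults or fail
    }

    llama_model * model = llama_model_load_from_file(path, mparams);
    return model != nullptr;
}
```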
@@ -1354,7 +1369,9 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
    // The logger state is global so these functions are NOT thread safe.
    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

    //
    // Performance utils

File diff suppressed because it is too large
@@ -3,6 +3,7 @@
#include "ggml.h" // ggml_op

#include <string>
#include <set>

//
// gguf constants (sync with gguf.py)
@@ -318,6 +319,7 @@ enum llm_tensor {
    LLM_TENSOR_DENSE_3_OUT,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -529,6 +531,10 @@ struct LLM_TN_IMPL {
    const int bid;
    const int xid;

    const std::set<llm_tensor> model_tensors;

    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);

    std::string str() const;

    operator std::string() const {
@@ -550,11 +556,11 @@ struct LLM_TN {
    llm_arch arch;

    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
        return { arch, tensor, suffix, bid, xid };
        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
    }

    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
        return { arch, tensor, nullptr, bid, xid };
        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
    }
};
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
    udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
    udata->output  .resize(n_tokens);

    udata->seq_id_data.reserve(n_tokens);

    seq_set_t seq_set_unq;

    for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }

        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
        udata->seq_id[i]   = batch.seq_id[idxs[i]];
        udata->output[i]   = batch.logits[idxs[i]];

        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
            seq_set_unq.set(udata->seq_id[i][s]);
            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];

            udata->seq_id_data.push_back(seq_id);
            seq_set_unq.set(seq_id);
        }

        if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }
    }

    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
    for (size_t i = 0; i < idxs.size(); ++i) {
        udata->seq_id[i] = seq_id_ptr;
        seq_id_ptr += udata->n_seq_id[i];
    }

    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            udata->seq_idx[s] = udata->seq_id_unq.size();
@@ -56,13 +56,15 @@ struct llama_ubatch {
        std::vector<float>          embd;
        std::vector<llama_pos>      pos;
        std::vector<int32_t>        n_seq_id;
        std::vector<llama_seq_id *> seq_id;
        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
        std::vector<llama_seq_id>   seq_id_unq;
        std::vector<int32_t>        seq_idx;
        std::vector<int8_t>         output;

        std::vector<llama_seq_id>   seq_id_data;
    };

    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
    std::shared_ptr<data_t> data;
};
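The batch change makes the ubatch own its sequence-id storage: ids are first appended to a flat seq_id_data vector, and only after it stops growing are the per-token seq_id pointers set into it. Fixing up the pointers in a second pass matters because a std::vector may reallocate while elements are still being pushed. A minimal sketch of the pattern with hypothetical names:

```cpp
#include <vector>

// Hypothetical illustration of the owned-storage + pointer fix-up pattern.
struct tok { int n_ids; const int * ids; };

int main() {
    std::vector<int> counts = {2, 1, 3};

    std::vector<tok> toks(counts.size());
    std::vector<int> id_data; // flat owned storage

    // pass 1: fill the flat storage only; taking pointers here would be
    // unsafe because push_back can reallocate and invalidate them
    for (size_t i = 0; i < counts.size(); ++i) {
        toks[i].n_ids = counts[i];
        for (int s = 0; s < counts[i]; ++s) {
            id_data.push_back((int) i);
        }
    }

    // pass 2: the storage is final, so interior pointers are now stable
    const int * p = id_data.data();
    for (auto & t : toks) {
        t.ids = p;
        p += t.n_ids;
    }
    return 0;
}
```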
@@ -9,6 +9,7 @@
#include "llama-model.h"

#include <cinttypes>
#include <cmath>
#include <cstring>
#include <limits>
#include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }

    if (cparams.yarn_ext_factor != 0) {
        static auto get_mscale = [](float scale, float mscale) {
            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
        };

        const float factor = 1.0f / cparams.rope_freq_scale;

        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
        if (hparams.rope_yarn_log_mul != 0.0f) {
            // note: here we assume `mscale == 1.0f`
            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
            float mscale = 1.0f;
            const float mscale_all_dims = hparams.rope_yarn_log_mul;

            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            // special-case DEEPSEEK v2:
            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
                mscale = mscale_all_dims;
            }

            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);

            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
        } else {
            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
        }

        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
        //
        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
        //      https://github.com/ggml-org/llama.cpp/pull/17945
        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
    }

    cparams.yarn_attn_factor *= hparams.rope_attn_factor;

    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
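The YARN block above derives the attention scale from the mscale formula get_mscale(s, m) = 0.1·m·ln(s) + 1 (for s > 1), then cancels the extra 1/(1 + 0.1·ln(s)) factor that the ggml YARN kernel applies on its own. A small standalone numeric sketch of that arithmetic (the rope_yarn_log_mul value is an example, not a real model config):

```cpp
#include <cmath>
#include <cstdio>

// Standalone illustration of the yarn_attn_factor arithmetic above.
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
}

int main() {
    const float rope_freq_scale = 0.25f;  // e.g. 4x context extension
    const float factor          = 1.0f / rope_freq_scale;
    const float mscale          = 1.0f;   // assumed, as in the comment above
    const float mscale_all_dims = 0.1f;   // example rope_yarn_log_mul

    float attn = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);

    // cancel the factor baked into the ggml YARN kernel
    attn *= 1.0f / (1.0f + 0.1f * logf(factor));

    printf("yarn_attn_factor = %.4f\n", attn);
    return 0;
}
```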
@@ -93,14 +131,6 @@ llama_context::llama_context(
    // with causal attention, the batch size is limited by the context size
    cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
        cparams.n_batch = GGML_KQ_MASK_PAD;
    }
    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(

        backend_buft.clear();
        backend_ptrs.clear();
        backend_buf_exp_size.clear();

        for (auto & backend : backends) {
            auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,6 +275,7 @@ llama_context::llama_context(

            backend_buft.push_back(buft);
            backend_ptrs.push_back(backend.get());
            backend_buf_exp_size.push_back(0);
        }

        LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
@@ -359,7 +391,8 @@ llama_context::llama_context(

    // reserve pp (prompt processing) graph first so that buffers are only allocated once
    {
        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
                model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
        if (!gf) {
            if (pipeline_parallel) {
                LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(

    // reserve with tg (token generation) graph to get the number of splits and nodes
    {
        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
        if (!gf) {
            throw std::runtime_error("failed to allocate compute tg buffers");
        }
@ -392,7 +425,7 @@ llama_context::llama_context(
|
|||
//
|
||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||
//
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
|
|
@ -401,11 +434,13 @@ llama_context::llama_context(
|
|||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
if (size > 1) {
|
||||
if (!model.hparams.no_alloc) {
|
||||
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
}
|
||||
if (backend_buf_exp_size[i] > 1) {
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -424,6 +459,23 @@ llama_context::llama_context(
|
|||
}
|
||||
|
||||
llama_context::~llama_context() {
|
||||
// FIXME this currently results in a use-after-free bug if the model is freed before the context
|
||||
// if (!model.hparams.no_alloc) {
|
||||
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
// ggml_backend_t backend = backend_ptrs[i];
|
||||
// ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
|
||||
// const size_t size_exp = backend_buf_exp_size[i];
|
||||
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
// if (size_exp == size_act) {
|
||||
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// } else {
|
||||
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
ggml_opt_free(opt_ctx);
|
||||
}
|
||||
|
||||
|
|
@ -1325,6 +1377,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
||||
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
synchronize();
|
||||
buf_output = nullptr;
|
||||
logits = nullptr;
|
||||
embd = nullptr;
|
||||
|
|
@ -1396,7 +1449,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
|||
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
||||
ggml_cgraph * llama_context::graph_reserve(
|
||||
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
GGML_ASSERT(n_outputs >= 1);
|
||||
|
||||
|
|
@ -1433,8 +1487,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
|||
|
||||
// initialize scheduler with the specified graph
|
||||
if (split_only) {
|
||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||
if (sizes) {
|
||||
ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
|
||||
} else {
|
||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||
}
|
||||
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
GGML_ASSERT(!sizes);
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
|
@@ -2056,15 +2115,26 @@ void llama_context::perf_reset() {

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
    for (const auto & buft_size : model.memory_breakdown()) {
        ret[buft_size.first].model += buft_size.second;
    for (const auto & [buft, size] : model.memory_breakdown()) {
        ret[buft].model += size;
    }
    for (const auto & buft_size : memory->memory_breakdown()) {
        ret[buft_size.first].context += buft_size.second;
    if (memory) {
        for (const auto & [buft, size] : memory->memory_breakdown()) {
            ret[buft].context += size;
        }
    }
    for (const auto & backend_ptr : backends) {
        ggml_backend_t backend = backend_ptr.get();
        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
    if (model.hparams.no_alloc) {
        for (size_t i = 0; i < backends.size(); ++i) {
            ggml_backend_t backend = backends[i].get();
            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
            ret[buft].compute += backend_buf_exp_size[i];
        }
    } else {
        for (const auto & backend_ptr : backends) {
            ggml_backend_t backend = backend_ptr.get();
            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
            ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
        }
    }
    return ret;
}
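
The per-buffer-type breakdown assembled above is consumed by summing its three components. A self-contained sketch of that aggregation, with simplified stand-in types and invented sizes (not the library's own definitions):

    #include <cstdio>
    #include <map>
    #include <string>

    // simplified stand-in for llama_memory_breakdown_data, keyed by a buffer-type name
    struct breakdown {
        size_t model = 0, context = 0, compute = 0;
        size_t total() const { return model + context + compute; }
    };

    int main() {
        std::map<std::string, breakdown> mb;
        mb["CUDA0"].model   = size_t(700) << 20; // invented sizes in bytes
        mb["CUDA0"].context = size_t(80)  << 20;
        mb["CUDA0"].compute = size_t(40)  << 20;
        mb["CPU"].model     = size_t(250) << 20;
        mb["CPU"].compute   = size_t(16)  << 20;

        size_t sum = 0;
        for (const auto & [buft, b] : mb) {
            printf("%-5s model=%zu MiB context=%zu MiB compute=%zu MiB\n",
                   buft.c_str(), b.model >> 20, b.context >> 20, b.compute >> 20);
            sum += b.total();
        }
        printf("total across buffer types: %zu MiB\n", sum >> 20);
        return 0;
    }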
@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
    size_t model = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers

    size_t total() const {
        return model + context + compute;
    }
};
struct llama_context {
|
||||
|
|
@ -206,7 +210,8 @@ public:
|
|||
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
||||
|
||||
// reserve a graph with a dummy ubatch of the specified size
|
||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
|
||||
ggml_cgraph * graph_reserve(
|
||||
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
|
||||
|
||||
private:
|
||||
llm_graph_params graph_params(
|
||||
|
|
@ -281,9 +286,10 @@ private:
|
|||
|
||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||
|
||||
// buffer types used for the compute buffer of each backend
|
||||
// pointers and buffer types used for the compute buffer of each backend
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
|
||||
|
||||
llm_graph_result_ptr gf_res_prev;
|
||||
llm_graph_result_ptr gf_res_reserve;
|
||||
|
|
|
|||
|
|
@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
        for (int i = 0; i < n_tokens; ++i) {
            const float pos = ubatch->pos[i];
            attn_scale_data[i] = std::log(
                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
            ) * f_attn_temp_scale + 1.0;
        }
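
With the new offset parameter, the scale computed above is log(floor((pos + offset) / floor_scale) + 1) * temp_scale + 1. A self-contained sketch using the llama4-style values that appear later in this diff (floor scale 8192, temperature scale 0.1, offset 1); treat them as illustrative:

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // attention temperature scale, as in llm_graph_input_attn_temp::set_input above
    static float attn_temp_scale(float pos, float offset, float floor_scale, float temp_scale) {
        return std::log(std::floor((pos + offset) / floor_scale) + 1.0f) * temp_scale + 1.0f;
    }

    int main() {
        const float floor_scale = 8192.0f;
        const float temp_scale  = 0.1f;
        for (float pos : {0.0f, 8191.0f, 8192.0f, 65536.0f}) {
            printf("pos=%7.0f -> scale=%.4f\n", pos, attn_temp_scale(pos, 1.0f, floor_scale, temp_scale));
        }
        return 0;
    }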
@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
|
|||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
|
||||
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
|
||||
|
||||
this->mctx = mctx;
|
||||
|
||||
bool res = true;
|
||||
|
||||
res &= s_copy->ne[0] == mctx->get_n_rs();
|
||||
|
||||
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
|
||||
|
||||
res &= head == mctx->get_head();
|
||||
res &= rs_z == mctx->get_rs_z();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
|
|
@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
|||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
|||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
|
||||
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|||
}
|
||||
}
|
||||
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int i = n_tokens; i < n_tokens; ++i) {
|
||||
for (int j = 0; j < n_enc; ++j) {
|
||||
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
|
||||
}
|
||||
|
|
@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|||
}
|
||||
|
||||
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||
inp_attn->set_input(ubatch);
|
||||
inp_rs->set_input(ubatch);
|
||||
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
|
||||
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
|
||||
|
||||
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||
|
||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||
|
||||
if (inp_rs->s_copy) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
|
||||
int32_t * data = (int32_t *) inp_rs->s_copy->data;
|
||||
|
||||
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
|
||||
for (uint32_t i = 0; i < n_rs; ++i) {
|
||||
data[i] = mctx->get_recr()->s_copy(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
|
||||
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
|
||||
|
||||
this->mctx = mctx;
|
||||
|
||||
bool res = true;
|
||||
|
||||
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
|
||||
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
|
||||
|
||||
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
|
||||
|
||||
res &= inp_rs->head == mctx->get_recr()->get_head();
|
||||
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
//
|
||||
|
|
@ -1097,8 +1153,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cur = ggml_relu(ctx0, cur);
|
||||
cur = ggml_sqr(ctx0, cur);
|
||||
cb(cur, "ffn_moe_relu_sqr", il);
|
||||
}
|
||||
break;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
|
@ -1213,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
|||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
|
||||
|
||||
auto & cur = inp->attn_scale;
|
||||
|
||||
|
|
@ -1480,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
|
|||
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
|
||||
|
||||
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
|
|
@ -1568,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
|||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1711,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
|||
|
||||
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
|
||||
|
||||
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->cross_kq_mask);
|
||||
|
||||
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
|
||||
|
|
@ -1777,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
|||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1791,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
|||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
|
|
@ -1851,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
|
|||
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
|
||||
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
|
||||
|
||||
inp->head = mctx_cur->get_head();
|
||||
inp->rs_z = mctx_cur->get_rs_z();
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
|
|
@ -1919,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
|||
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
||||
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
|
||||
|
||||
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
|
||||
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
|
||||
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
|
||||
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
|
||||
|
||||
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -132,8 +132,8 @@ public:
|
|||
// temperature tuning, used by llama4
|
||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
|
||||
virtual ~llm_graph_input_attn_temp() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
|
@ -142,6 +142,7 @@ public:
|
|||
|
||||
const uint32_t n_attn_temp_floor_scale;
|
||||
const float f_attn_temp_scale;
|
||||
const float f_attn_temp_offset;
|
||||
};
|
||||
|
||||
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
||||
|
|
@ -224,6 +225,8 @@ public:
|
|||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * s_copy; // I32 [n_rs]
|
||||
|
||||
// views of s_copy, computed once per graph
|
||||
|
|
@ -232,6 +235,10 @@ public:
|
|||
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
|
||||
|
||||
const llama_memory_recurrent_context * mctx;
|
||||
|
||||
// used in view offsets, need to match for valid graph reuse
|
||||
uint32_t head;
|
||||
int32_t rs_z;
|
||||
};
|
||||
|
||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||
|
|
@ -364,22 +371,28 @@ public:
|
|||
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_mem_hybrid(
|
||||
const llama_cparams & cparams,
|
||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
||||
const llama_memory_hybrid_context * mctx) :
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
||||
const llama_memory_hybrid_context * mctx) :
|
||||
inp_attn(std::move(inp_attn)),
|
||||
inp_rs(std::move(inp_rs)),
|
||||
cparams(cparams),
|
||||
mctx(mctx) { }
|
||||
virtual ~llm_graph_input_mem_hybrid() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
||||
|
||||
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
|
||||
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
||||
|
||||
const llama_cparams cparams;
|
||||
|
||||
const llama_memory_hybrid_context * mctx;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include "llama-hparams.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
|
|
@@ -237,3 +239,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

    return false;
}

bool llama_hparams::use_mrope() const {
    return rope_sections[0] > 0 && rope_sections[1] > 0;
}
|
|||
|
|
@ -34,6 +34,7 @@ struct llama_hparams_convnext {
|
|||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool no_alloc;
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
|
|
@ -109,6 +110,7 @@ struct llama_hparams {
|
|||
float rope_freq_base_train_swa;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
|
|
@ -166,6 +168,7 @@ struct llama_hparams {
|
|||
uint32_t n_no_rope_layer_step = 4;
|
||||
uint32_t n_attn_temp_floor_scale = 0;
|
||||
float f_attn_temp_scale = 0.0f;
|
||||
float f_attn_temp_offset = 0.0f; // offset position index
|
||||
|
||||
// gemma3n altup
|
||||
uint32_t n_altup = 4; // altup_num_inputs
|
||||
|
|
@ -272,7 +275,8 @@ struct llama_hparams {
|
|||
// TODO: think of a better place for this function
|
||||
// TODO: pack the SWA params in a struct?
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||
|
||||
bool use_mrope() const;
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,10 @@ time_meas::~time_meas() {
|
|||
}
|
||||
}
|
||||
|
||||
void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
|
||||
ggml_log_get(log_callback, user_data);
|
||||
}
|
||||
|
||||
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
ggml_log_set(log_callback, user_data);
|
||||
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
||||
|
|
|
|||
|
|
@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto & [buft, ctx] : ctx_map) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
        ggml_backend_buffer_t buf;
        if (model.hparams.no_alloc) {
            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
            }
        } else {
            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
        }
        if (!buf) {
            throw std::runtime_error("failed to allocate buffer for kv cache");
        }
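
The no_alloc path above creates tensor metadata but no backing memory; the size such a buffer would need is reported separately (the memory_breakdown hunk further down calls ggml_backend_alloc_ctx_tensors_from_buft_size for that). A minimal sketch of sizing tensors from a metadata-only context using only core ggml calls; it ignores the per-buffer alignment and padding that the real helper accounts for:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // metadata-only ggml context: tensor descriptors are created, no data is allocated
        ggml_init_params ip = { /*mem_size =*/ ggml_tensor_overhead() * 8, /*mem_buffer =*/ nullptr, /*no_alloc =*/ true };
        ggml_context * ctx = ggml_init(ip);

        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 1024); // stand-in for the K tensor of one layer
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 1024); // stand-in for the V tensor of one layer

        // rough lower bound on what a real backend buffer would need
        size_t need = 0;
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
            need += ggml_nbytes(t);
        }
        printf("KV tensors would need at least %zu bytes\n", need);

        ggml_free(ctx);
        return 0;
    }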
@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
|
|||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const auto & [_, buf] : ctxs_bufs) {
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
for (const auto & [ctx, buf] : ctxs_bufs) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
|
||||
|
||||
if (hparams.no_alloc) {
|
||||
GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
|
||||
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
|
||||
} else {
|
||||
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
|
||||
ret[buft] += ggml_backend_buffer_get_size(buf.get());
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -1232,8 +1249,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
|
|||
GGML_ASSERT(n_tokens%n_stream == 0);
|
||||
|
||||
// n_tps == n_tokens_per_stream
|
||||
const int64_t n_tps = n_tokens/n_stream;
|
||||
const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
|
||||
const int64_t n_tps = n_tokens/n_stream;
|
||||
|
||||
std::fill(data, data + ggml_nelements(dst), -INFINITY);
|
||||
|
||||
|
|
@ -1266,7 +1282,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
|
|||
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
||||
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
||||
|
||||
const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
|
||||
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
|
||||
|
||||
for (uint32_t j = 0; j < n_kv; ++j) {
|
||||
if (cells.is_empty(j)) {
|
||||
|
|
@ -1370,9 +1386,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
float freq_scale) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
|
||||
|
|
@ -1383,12 +1400,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
? LLAMA_ROPE_TYPE_NEOX
|
||||
: hparams.rope_type;
|
||||
|
||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
|
||||
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
|
||||
: cparams.yarn_attn_factor;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
|
|
@ -1550,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
|
|||
|
||||
const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
|
||||
|
||||
slot_info sinfo;
|
||||
|
||||
bool res = true;
|
||||
res = res && state_read_meta(io, strm, cell_count, seq_id);
|
||||
res = res && state_read_data(io, strm, cell_count);
|
||||
res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
|
||||
res = res && state_read_data(io, strm, cell_count, sinfo);
|
||||
|
||||
if (!res) {
|
||||
if (seq_id == -1) {
|
||||
|
|
@ -1691,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
|||
}
|
||||
}
|
||||
|
||||
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
|
||||
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
|
||||
auto & cells = v_cells[strm];
|
||||
auto & head = v_heads[strm];
|
||||
|
||||
|
|
@ -1728,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
ubatch.seq_id[i] = &dest_seq_id;
|
||||
}
|
||||
|
||||
const auto sinfo = find_slot(ubatch, true);
|
||||
sinfo = find_slot(ubatch, false);
|
||||
if (sinfo.empty()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
|
|
@ -1738,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
|
||||
apply_ubatch(sinfo, ubatch);
|
||||
|
||||
const auto head_cur = sinfo.head();
|
||||
LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
|
||||
|
||||
// keep the head at the old position because we will read the KV data into it in state_read_data()
|
||||
head = head_cur;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
|
||||
|
||||
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(head_cur + cell_count <= cells.size());
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
|
||||
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
|
||||
// DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
|
||||
GGML_ASSERT(sinfo.n_stream() == 1);
|
||||
GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const uint32_t idx = sinfo.idxs[0][i];
|
||||
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
|
||||
GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
|
||||
}
|
||||
} else {
|
||||
// whole KV cache restore
|
||||
|
||||
|
|
@ -1784,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
}
|
||||
|
||||
// Create contiguous slot_info for whole cache restore
|
||||
sinfo.s0 = strm;
|
||||
sinfo.s1 = strm;
|
||||
sinfo.resize(1);
|
||||
sinfo.strm[0] = strm;
|
||||
sinfo.idxs[0].resize(cell_count);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
sinfo.idxs[0][i] = i;
|
||||
}
|
||||
|
||||
head = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
|
||||
auto & cells = v_cells[strm];
|
||||
auto & head = v_heads[strm];
|
||||
|
||||
uint32_t v_trans;
|
||||
uint32_t n_layer;
|
||||
|
|
@ -1842,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the keys for the whole cell range
|
||||
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells, single memcpy
|
||||
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
const void * src = io.read(cell_count * k_size_row);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
|
||||
ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1874,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the values for the whole cell range
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells, single memcpy
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
const void * src = io.read(cell_count * v_size_row);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
|
||||
ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1914,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// For each row in the transposed matrix, read the values for the whole cell range
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells
|
||||
const uint32_t h = sinfo.head();
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (h + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
}
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const void * src = io.read(cell_count * v_size_el);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -72,6 +72,23 @@ public:
        void clear() {
            idxs.clear();
        }

        // check if indices are contiguous starting from head()
        bool is_contiguous() const {
            if (idxs.empty() || idxs[0].empty()) {
                return true;
            }
            if (idxs.size() > 1) {
                return false;
            }
            const uint32_t h = idxs[0][0];
            for (size_t i = 0; i < idxs[0].size(); ++i) {
                if (idxs[0][i] != h + i) {
                    return false;
                }
            }
            return true;
        }
    };

    using slot_info_vec_t = std::vector<slot_info>;
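
The contiguity check above is what selects the fast single-copy path over the per-cell scatter in state_read_data. A self-contained sketch of the same predicate over a plain index vector (not the actual slot_info type):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // same idea as slot_info::is_contiguous(): a single stream whose indices increase by 1
    static bool is_contiguous(const std::vector<uint32_t> & idxs) {
        for (size_t i = 0; i < idxs.size(); ++i) {
            if (idxs[i] != idxs[0] + i) {
                return false;
            }
        }
        return true;
    }

    int main() {
        printf("%d\n", is_contiguous({7, 8, 9, 10})); // 1 -> contiguous block, single copy
        printf("%d\n", is_contiguous({7, 9, 10}));    // 0 -> scatter per cell
        return 0;
    }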
|
|
@ -264,8 +281,8 @@ private:
|
|||
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
|
||||
};
|
||||
|
||||
class llama_kv_cache_context : public llama_memory_context_i {
|
||||
|
|
|
|||
|
|
@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
|
|||
ubatches(std::move(ubatches)),
|
||||
// note: here we copy the ubatches. not sure if this is ideal
|
||||
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
|
||||
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
|
||||
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
|
||||
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
|
|||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
int trace = 0;
|
||||
|
|
@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
this->use_mmap = use_mmap;
|
||||
this->check_tensors = check_tensors;
|
||||
this->no_alloc = no_alloc;
|
||||
}
|
||||
|
||||
std::string llama_model_loader::get_arch_name() const {
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ struct llama_model_loader {
|
|||
|
||||
bool use_mmap = false;
|
||||
bool check_tensors;
|
||||
bool no_alloc;
|
||||
|
||||
llama_files files;
|
||||
llama_ftype ftype;
|
||||
|
|
@ -97,6 +98,7 @@ struct llama_model_loader {
|
|||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
|
|
|
|||
|
|
@ -669,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
hparams.n_swa = 8192;
|
||||
hparams.n_attn_temp_floor_scale = 8192;
|
||||
hparams.f_attn_temp_scale = 0.1f;
|
||||
hparams.f_attn_temp_offset = 1.0f;
|
||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||
}
|
||||
|
||||
|
|
@@ -1636,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // that have no expert_gating_func model parameter set
                hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
            }
            ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

            if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
                // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
                // cancel the factor from the convert script
                hparams.rope_yarn_log_mul /= 0.1f;
            }

            // (optional) temperature tuning - used by mistral-large
            ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
            ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

            hparams.f_attn_temp_offset = 0.0f;

            switch (hparams.n_layer) {
                case 27: type = LLM_TYPE_16B; break;
                case 60: type = LLM_TYPE_236B; break;
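
The division by 0.1f above undoes a scaling applied at conversion time: the assumption here is that the convert script stores rope.scaling.yarn_log_multiplier as 0.1 * mscale_all_dim (the "cancel the factor from the convert script" comment points that way), so dividing by 0.1 recovers mscale_all_dim for the yarn_attn_factor computation earlier in this diff. A tiny numeric sketch with an invented config value:

    #include <cstdio>

    int main() {
        const float mscale_all_dim    = 0.707f;                 // invented config.json value
        const float stored_in_gguf    = 0.1f * mscale_all_dim;  // what the convert script writes (assumed)
        const float rope_yarn_log_mul = stored_in_gguf / 0.1f;  // what load_hparams recovers above
        printf("config %.3f -> gguf %.4f -> recovered %.3f\n", mscale_all_dim, stored_in_gguf, rope_yarn_log_mul);
        return 0;
    }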
@ -1681,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_GLM4:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
|
||||
switch (hparams.n_layer) {
|
||||
case 40: type = LLM_TYPE_9B; break;
|
||||
case 61: type = LLM_TYPE_32B; break;
|
||||
|
|
@ -1690,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
|
||||
|
||||
// MoE parameters
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
||||
|
|
@ -2282,7 +2292,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 80: type = LLM_TYPE_80B_A3B; break;
|
||||
case 48: type = LLM_TYPE_80B_A3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -2291,9 +2301,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
|
||||
|
||||
hparams.f_attn_temp_offset = 0.0f;
|
||||
|
||||
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
|
|
@ -2303,18 +2315,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
|
||||
// but may need further verification with other values
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
float factor = 1.0f / hparams.rope_freq_scale_train;
|
||||
float mscale = 1.0f;
|
||||
float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
case 34: type = LLM_TYPE_8B; break;
|
||||
|
|
@ -3414,9 +3414,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -6678,9 +6678,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
|
||||
GGML_ASSERT(!ml.no_alloc);
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
|
||||
// then we could just use metal for all layers
|
||||
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
||||
void * addr = nullptr;
|
||||
size_t first, last; // NOLINT
|
||||
|
|
@ -6696,9 +6698,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
bufs.emplace_back(buf);
|
||||
buf_map.emplace(idx, buf);
|
||||
}
|
||||
}
|
||||
else {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
} else {
|
||||
ggml_backend_buffer_t buf;
|
||||
if (ml.no_alloc) {
|
||||
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
|
||||
}
|
||||
} else {
|
||||
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
|
||||
}
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
|
||||
}
|
||||
|
|
@ -6753,6 +6762,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
}
|
||||
}
|
||||
|
||||
if (ml.no_alloc) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
for (auto & [ctx, buf_map] : ctx_buf_maps) {
|
||||
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
|
||||
|
|
@ -6795,9 +6808,18 @@ size_t llama_model::n_devices() const {
|
|||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
|
||||
for (const auto & buf : bufs) {
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
|
||||
if (hparams.no_alloc) {
|
||||
GGML_ASSERT(bufs.size() == 1);
|
||||
ggml_backend_buffer_t buf = bufs[0].get();
|
||||
GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
|
||||
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
|
||||
} else {
|
||||
for (const auto & buf : bufs) {
|
||||
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
|
@ -6842,6 +6864,7 @@ void llama_model::print_info() const {
|
|||
// hparams
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
|
||||
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
|
||||
LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
|
|
@ -6876,6 +6899,7 @@ void llama_model::print_info() const {
|
|||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
||||
// MRoPE (Multi-axis Rotary Position Embedding) sections
|
||||
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
|
||||
|
|
@ -6940,7 +6964,6 @@ void llama_model::print_info() const {
|
|||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_QWEN2MOE) {
|
||||
|
|
@ -7697,6 +7720,7 @@ llama_model_params llama_model_default_params() {
|
|||
/*.check_tensors =*/ false,
|
||||
/*.use_extra_bufts =*/ true,
|
||||
/*.no_host =*/ false,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
|
@ -7817,7 +7841,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_PLM:
|
||||
case LLM_ARCH_CHATGLM:
|
||||
case LLM_ARCH_GLM4:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
|
|
@ -7880,7 +7903,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
case LLM_ARCH_APERTUS:
|
||||
|
|
@ -7897,6 +7919,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_QWEN3VLMOE:
|
||||
return LLAMA_ROPE_TYPE_IMROPE;
|
||||
|
||||
case LLM_ARCH_GLM4:
|
||||
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
GGML_ABORT("unknown architecture");
|
||||
|
|
|
|||
|
|
@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
|
|||
|
|
@ -1884,7 +1884,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "qwen2" ||
|
||||
tokenizer_pre == "deepseek-r1-qwen") {
|
||||
tokenizer_pre == "deepseek-r1-qwen" ||
|
||||
tokenizer_pre == "kormo") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "llama-chat.h"
|
||||
#include "llama-context.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-model-loader.h"
|
||||
|
|
@ -11,11 +14,14 @@
|
|||
#include "ggml-backend.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
|
@ -37,6 +43,646 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
struct llama_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
llama_memory_breakdown_data mb;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||
const ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
if (model->devices[i] == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs = model->devices;
|
||||
hp_ngl = model->hparams.n_layer;
|
||||
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||
hp_n_expert = model->hparams.n_expert;
|
||||
|
||||
    llama_memory_breakdown_print(ctx); // goes to debug log

    llama_free(ctx);
    llama_model_free(model);
    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
    return ret;
}
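
llama_get_device_memory_data temporarily swaps the logger so that messages below the requested level are demoted to debug output, and restores it on every exit path. The same idea can be packaged as a small RAII guard; this is a hedged sketch built only on llama_log_set and the llama_log_get accessor added in this diff, not code from the PR:

    #include "llama.h"

    // RAII guard: demote messages below min_level to GGML_LOG_LEVEL_DEBUG on the
    // previously installed logger, and restore that logger when the guard is destroyed
    struct log_level_guard {
        ggml_log_callback prev_cb   = nullptr;
        void *            prev_data = nullptr;
        ggml_log_level    min_level;

        explicit log_level_guard(ggml_log_level min_level) : min_level(min_level) {
            llama_log_get(&prev_cb, &prev_data);
            llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
                const auto * g = static_cast<const log_level_guard *>(user_data);
                if (g->prev_cb) {
                    g->prev_cb(level >= g->min_level ? level : GGML_LOG_LEVEL_DEBUG, text, g->prev_data);
                }
            }, this);
        }

        ~log_level_guard() { llama_log_set(prev_cb, prev_data); }
    };

    // usage: { log_level_guard quiet(GGML_LOG_LEVEL_ERROR); /* noisy model probing here */ }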
// enum to identify part of a layer for distributing its tensors:
|
||||
enum layer_fraction_t {
|
||||
LAYER_FRACTION_NONE = 0, // nothing
|
||||
LAYER_FRACTION_ATTN = 1, // attention
|
||||
LAYER_FRACTION_UP = 2, // attention + up
|
||||
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||
};
|
||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
|
||||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
if (nd == 0) {
|
||||
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (ggml_backend_dev_t dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_total = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t min_projected_free = INT64_MAX;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_ctx = 0;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
|
||||
sum_total += dmd.total;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
min_projected_free = std::min(min_projected_free, projected_free);
|
||||
sum_projected_ctx += dmd.mb.context;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
|
||||
projected_free >= 0 ? "surplus" : "deficit");
|
||||
}
|
||||
}
|
||||
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
|
||||
assert(sum_projected_used >= sum_projected_ctx);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_total/MiB);
|
||||
if (min_projected_free >= margin) {
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
|
||||
if (global_surplus < 0) {
|
||||
LLAMA_LOG_INFO(nd == 1 ?
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, margin/MiB, -global_surplus/MiB);
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
|
||||
const uint32_t ctx_reduction = std::min(
|
||||
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
|
||||
cparams->n_ctx = hp_nct - ctx_reduction;
|
||||
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
|
||||
global_surplus += memory_reduction;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (global_surplus >= 0) {
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
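    // As a worked example of the reduction above (illustrative numbers only): if
    // the projected context memory is 8192 MiB for a training context of 32768
    // tokens, bytes_per_ctx is 256 KiB, so a 4096 MiB deficit shrinks n_ctx by
    // ceil(4096 MiB / 256 KiB) = 16384 tokens, subject to the n_ctx_min floor.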

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
        if (hp_ngl < 2*nd) {
            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }
    if (!tensor_buft_overrides) {
        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the devices back to front with "dense" layers
    // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
    // - for a MoE model, same as dense model but with all MoE tensors in system memory

    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
        constexpr size_t n_strings = 1000;
        if (il >= n_strings) {
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
        }
        switch (lf) {
            case LAYER_FRACTION_ATTN: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_UP: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_GATE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_MOE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
                }
                return patterns[il].c_str();
            }
            default:
                GGML_ABORT("fatal error");
        }
};
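    // For example (derived from the patterns above), layer index il=7 yields:
    //   LAYER_FRACTION_ATTN -> "blk\.7\.ffn_(up|gate|down).*"         (everything but attention overflows)
    //   LAYER_FRACTION_UP   -> "blk\.7\.ffn_(gate|down).*"            (attention + up stay on the device)
    //   LAYER_FRACTION_GATE -> "blk\.7\.ffn_down.*"                   (attention + up + gate stay)
    //   LAYER_FRACTION_MOE  -> "blk\.7\.ffn_(up|down|gate)_(ch|)exps" (only sparse MoE weights overflow)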

    struct ngl_t {
        uint32_t n_layer = 0; // number of total layers
        uint32_t n_part  = 0; // number of partial layers, <= n_layer

        // for the first partial layer varying parts can overflow; all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
    };

    const size_t ntbo = llama_max_tensor_buft_overrides();

    // utility function to set n_gpu_layers and tensor_split
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            llama_model_params & mparams,
            const bool add_nonrepeating) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
            if (nd > 1) {
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides

        if (add_nonrepeating) {
            mparams.n_gpu_layers += 1;
            tensor_split[nd - 1] += 1;
        }
        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft    = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
                    throw std::runtime_error("llama_max_tensor_buft_overrides() == "
                        + std::to_string(ntbo) + " is insufficient for model\n");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
                tensor_buft_overrides[itbo].buft    = overflow_bufts[id];
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
        }
        tensor_buft_overrides[itbo].pattern = nullptr;
        tensor_buft_overrides[itbo].buft    = nullptr;
        itbo++;
        mparams.tensor_buft_overrides = tensor_buft_overrides;
};
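    // A small worked example of what this produces (illustrative values): for
    // nd=2 with ngl_per_device = { {n_layer=10, n_part=0}, {n_layer=8, n_part=3} },
    // n_gpu_layers becomes 18, tensor_split becomes {10, 8}, and three override
    // entries route the overflowing tensors of the last three layers assigned to
    // device 1 to overflow_bufts[1], followed by the {nullptr, nullptr} terminator.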

    // utility function that returns the memory use per device for given numbers of layers per device
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            const bool add_nonrepeating) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
            LLAMA_LOG_DEBUG(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }

        std::vector<int64_t> ret;
        ret.reserve(nd);
        for (const llama_device_memory_data & dmd : dmd_nl) {
            ret.push_back(dmd.mb.total());
        }
        return ret;
    };

    int64_t global_surplus_cpu_moe = 0;
    if (hp_nex > 0) {
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
            global_surplus_cpu_moe += dmd.free;
            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
        }

        if (global_surplus_cpu_moe > 0) {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

        // reset
        tensor_buft_overrides[0] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;
    }

    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margin);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    // whether for the optimal memory use we expect to load at least some MoE tensors:
    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
    overflow_bufts.reserve(nd);
    for (size_t id = 0; id < nd - 1; ++id) {
        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
    }
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
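    // i.e. with three devices the overflow chain is GPU0 -> GPU1 -> GPU2 -> system
    // memory: a partial layer spills its overflowing tensors to the next device's
    // buffer type, and the last device spills to the CPU buffer type appended above.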

    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
    if (hp_nex > 0) {
        for (size_t id = 0; id < nd; id++) {
            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
        }
    }

    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
    // - try a "high" configuration where a device is given all unassigned layers
    // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    // - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
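    // A minimal sketch of the interpolation step used below, assuming integer layer
    // counts with measured memory use mem (low bound) and mem_high (high bound):
    //   step_size = delta * (target - mem) / (mem_high - mem), clamped to [1, delta - 1]
    // e.g. if 0 layers take 4 GiB, 32 layers take 36 GiB and the target is 20 GiB,
    // the first guess is 32 * 16/32 = 16 layers.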

    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    uint32_t n_unassigned = hp_ngl;
    for (int id = nd - 1; id >= 0; id--) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
            if (mem_high[id] > targets[id]) {
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
                    step_size = std::min(step_size, delta - 1);

                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
                        ngl_per_device_test[id].n_part += step_size;
                    }
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem = mem_test;
                        n_unassigned -= ngl_per_device[id].n_layer;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
                ngl_per_device = ngl_per_device_high;
                n_unassigned -= ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
        return;
    }

    // step 4: for a MoE model where all dense tensors fit,
    //   convert the dense-only layers in the back to full layers in the front until all devices are full
    //   essentially the same procedure as for the dense-only layers except front-to-back
    //   also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM

    size_t id_dense_start = nd;
    for (int id = nd - 1; id >= 0; id--) {
        if (ngl_per_device[id].n_layer > 0) {
            id_dense_start = id;
            continue;
        }
        break;
    }
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part   = 0;
        }
        size_t id_dense_start_high = nd - 1;
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);

        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
            assert(ngl_per_device[id].n_layer      >= ngl_per_device[id].n_part);
            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
                step_size = std::min(step_size, delta - 1);

                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                size_t id_dense_start_test = id_dense_start;
                uint32_t n_converted_test = 0;
                for (; id_dense_start_test < nd; id_dense_start_test++) {
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
                    ngl_per_device_test[id_dense_start_test].n_part  -= n_convert_jd;
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
                        break;
                    }
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high = mem_test;
                    id_dense_start_high = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            }
        } else {
            ngl_per_device = ngl_per_device_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
        if (ngl_per_device[id_dense_start].n_layer > 0) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}

bool llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    bool ok = true;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const std::runtime_error & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        ok = false;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return ok;
}
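// A minimal caller-side sketch (the buffer sizes and model path are illustrative;
// llama_max_devices() == 16 per this build):
//   llama_model_params   mparams = llama_model_default_params();
//   llama_context_params cparams = llama_context_default_params();
//   float tensor_split[16] = {0};
//   std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
//   if (llama_params_fit("model.gguf", &mparams, &cparams, tensor_split, tbo.data(),
//                        1024*1024*1024 /*margin: 1 GiB*/, 4096, GGML_LOG_LEVEL_INFO)) {
//       // mparams/cparams now hold the fitted n_gpu_layers, tensor_split and overrides
//   }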

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,

@ -49,6 +695,10 @@ size_t llama_max_devices(void) {
    return 16;
}

size_t llama_max_tensor_buft_overrides() {
    return 4096;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

@ -108,11 +758,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;
        model.hparams.no_alloc   = params.no_alloc;

        try {
            model.load_arch(ml);

@ -1,7 +1,5 @@
#include "models.h"

llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B

@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]

    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
    GGML_ASSERT(ext_factor >= 0.0f);
    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));

    // use the original attn_factor to pre-scale the kq_scale
    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));

    ggml_tensor * cur;
ggml_tensor * inpL;
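    // In equation form, the new pre-scaling above computes:
    //   attn_factor_org = attn_factor * (1 + 0.1 * ln(1/freq_scale))
    //   mscale          = attn_factor_org * (1 + 0.1 * rope_yarn_log_mul * ln(1/freq_scale))
    //   kq_scale        = mscale^2 / sqrt(n_embd_head_k)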

@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    bool use_mrope = hparams.use_mrope();
    if (ubatch.embd && !use_mrope) {
        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
    cb(Kcur, "Kcur_normed", il);
}
Qcur = ggml_rope_ext(
    ctx0, Qcur, inp_pos, nullptr,
    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
    ext_factor, attn_factor, beta_fast, beta_slow
);

Kcur = ggml_rope_ext(
    ctx0, Kcur, inp_pos, nullptr,
    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
    ext_factor, attn_factor, beta_fast, beta_slow
);
if (use_mrope) {
    Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);
} else {
    // Normal RoPE
    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
        rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
        rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);
}

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    bool use_mrope = hparams.use_mrope();
    if (ubatch.embd && !use_mrope) {
        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
    Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
        cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
}
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
    ext_factor, attn_factor, beta_fast, beta_slow);

Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
    ext_factor, attn_factor, beta_fast, beta_slow);
if (use_mrope) {
    Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);
} else {
    // Normal RoPE
    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
        rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
        rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);
}

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

@ -441,23 +441,13 @@ private:
    ggml_tensor * cur,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    ggml_tensor * diag_mask,
    int il);

ggml_tensor * build_layer_ffn(
    ggml_tensor * cur,
    int il);

ggml_tensor * build_delta_net_recurrent(
    ggml_tensor * q,
    ggml_tensor * k,
    ggml_tensor * v,
    ggml_tensor * g,
    ggml_tensor * beta,
    ggml_tensor * state,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    int il);

ggml_tensor * build_delta_net_chunking(
    ggml_tensor * q,
    ggml_tensor * k,

@ -467,8 +457,18 @@ private:
    ggml_tensor * state,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    ggml_tensor * diag_mask,
    int il);

ggml_tensor * build_delta_net_autoregressive(
    ggml_tensor * q,
    ggml_tensor * k,
    ggml_tensor * v,
    ggml_tensor * g,
    ggml_tensor * beta,
    ggml_tensor * state,
    int il);

ggml_tensor * build_norm_gated(
    ggml_tensor * input,
    ggml_tensor * weights,

@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
{
    // compute Q and K and RoPE them
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
    cb(Qcur, "Qcur", il);
    if (model.layers[il].bq) {
        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
        cb(Qcur, "Qcur", il);
    }

    ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
    cb(Kcur, "Kcur", il);
    if (model.layers[il].bk) {
        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
        cb(Kcur, "Kcur", il);
    }

    ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
    cb(Vcur, "Vcur", il);
    if (model.layers[il].bv) {
        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
        cb(Vcur, "Vcur", il);
    }

    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
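    // (The new nullptr guards matter because some Qwen2-family GGUF conversions
    // ship without the attention bias tensors; an unconditional ggml_add on a
    // missing bias would fail during graph construction.)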

@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    ggml_tensor * causal_mask =
        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
            GGML_TRI_TYPE_LOWER);

    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
    ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);

    ggml_build_forward_expand(gf, causal_mask);
    ggml_build_forward_expand(gf, identity);
    ggml_build_forward_expand(gf, diag_mask);

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
        // Determine layer type and build appropriate attention mechanism
        if (hparams.is_recurrent(il)) {
            // Linear attention layer (gated delta net)
            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
        } else {
            // Full attention layer
            cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);

@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    ggml_tensor * state,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    ggml_tensor * diag_mask,
    int il) {
    GGML_ASSERT(ggml_is_contiguous(q));
    GGML_ASSERT(ggml_is_contiguous(k));
    GGML_ASSERT(ggml_is_contiguous(v));
    GGML_ASSERT(ggml_is_contiguous(g));
    GGML_ASSERT(ggml_is_contiguous(beta));
    GGML_ASSERT(ggml_is_contiguous(state));

    const int64_t S_k = q->ne[0];
    const int64_t H_k = q->ne[1];
    const int64_t n_tokens = q->ne[2];

@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(

    GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case

    // TODO: can this ever be false?
    const bool use_qk_l2norm = true;
    const float eps_norm = hparams.f_norm_rms_eps;

    if (use_qk_l2norm) {
        const float eps_norm = hparams.f_norm_rms_eps;

        q = ggml_l2_norm(ctx0, q, eps_norm);
        k = ggml_l2_norm(ctx0, k, eps_norm);
    }
    q = ggml_l2_norm(ctx0, q, eps_norm);
    k = ggml_l2_norm(ctx0, k, eps_norm);

    const float scale = 1.0f / sqrtf(S_v);

@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(

    beta = ggml_sigmoid(ctx0, beta);

    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);

    cb(q, "q_in", il);
    cb(k, "k_in", il);
    cb(v, "v_in", il);

@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    cb(v_beta, "v_beta", il);
    cb(k_beta, "k_beta", il);

    ggml_tensor * chunked_mask =
        ggml_view_4d(ctx0, causal_mask, chunk_size,
            chunk_size, causal_mask->ne[2], causal_mask->ne[3],
            causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
    q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
    k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
    k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
    v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
    v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);

    ggml_tensor * chunked_diag_mask =
        ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
            chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
            causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);

    ggml_tensor * chunked_identity =
        ggml_view_4d(ctx0, identity, chunk_size,
            chunk_size, identity->ne[2], identity->ne[3],
            identity->nb[1], identity->nb[2], identity->nb[3], 0);

    q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
    k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
    k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
    v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
    v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);

    g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
    beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
    g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
    beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);

    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);

    cb(g_cumsum, "g_cumsum", il);

    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
    ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
    ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);

    ggml_tensor * gcs_j_broadcast =
        ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);

@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(

    cb(decay_mask, "decay_mask", il);

    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
    decay_mask = ggml_exp(ctx0, decay_mask);
    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
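    // Sketch of the resulting mask: entries are exp(difference of the gate cumsums)
    // on the lower triangle including the diagonal, and exactly zero elsewhere
    // (the second multiply re-zeros what exp() turned into ones).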

    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);

    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));

    cb(attn, "attn_pre_solve", il);

    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);

    ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
    attn = ggml_mul(ctx0, lin_solve, chunked_mask);
    attn = ggml_add(ctx0, attn, chunked_identity);
    attn = ggml_mul(ctx0, lin_solve, causal_mask);
    attn = ggml_add(ctx0, attn, identity);

cb(attn, "attn_solved", il);
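    // In matrix form this step solves (I - tril(attn)) * X = attn for X and then
    // takes attn = strict_tril(X) + I, with causal_mask the strictly lower
    // triangle; a sketch of why the per-row loop can be replaced by one solve.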

@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
    attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
    attn = ggml_mul(ctx0, attn, decay_mask_chunk);
    attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
    attn = ggml_mul(ctx0, attn, diag_mask);

    ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);

@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    return ggml_concat(ctx0, flat_output, flat_state, 0);
}

ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
    ggml_tensor * q,
    ggml_tensor * k,
    ggml_tensor * v,
    ggml_tensor * g,
    ggml_tensor * beta,
    ggml_tensor * state,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    int il) {
    GGML_ASSERT(ggml_is_contiguous(q));
    GGML_ASSERT(ggml_is_contiguous(k));
    GGML_ASSERT(ggml_is_contiguous(v));
    GGML_ASSERT(ggml_is_contiguous(g));
    GGML_ASSERT(ggml_is_contiguous(beta));
    GGML_ASSERT(ggml_is_contiguous(state));

    const int64_t S_k = q->ne[0];
    const int64_t H_k = q->ne[1];
    const int64_t n_tokens = q->ne[2];

@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
    const int64_t S_v = v->ne[0];
    const int64_t H_v = v->ne[1];

    GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
    GGML_ASSERT(v->ne[2] == n_tokens);
    GGML_ASSERT(k->ne[2] == n_tokens);
    GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);

@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(

    GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case

    // TODO: can this ever be false?
    const bool use_qk_l2norm = true;
    const float eps_norm = hparams.f_norm_rms_eps;

    if (use_qk_l2norm) {
        const float eps_norm = hparams.f_norm_rms_eps;

        q = ggml_l2_norm(ctx0, q, eps_norm);
        k = ggml_l2_norm(ctx0, k, eps_norm);
    }
    q = ggml_l2_norm(ctx0, q, eps_norm);
    k = ggml_l2_norm(ctx0, k, eps_norm);

    const float scale = 1.0f / sqrtf(S_v);

    q = ggml_scale(ctx0, q, scale);
    beta = ggml_sigmoid(ctx0, beta);

    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);

    cb(q, "q_in", il);
    cb(k, "k_in", il);
    cb(v, "v_in", il);
    cb(beta, "beta_in", il);
    cb(g, "g_in", il);

    q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
    k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
    v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
    g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);

    beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
    state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);

    cb(q, "q_perm", il);
    cb(k, "k_perm", il);
    cb(v, "v_perm", il);
    cb(beta, "beta_perm", il);
    cb(g, "g_perm", il);
    cb(state, "state_in", il);
    ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
    ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);

    GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
    GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
    GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
    GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
    // Apply exponential to g_t
    g_t = ggml_exp(ctx0, g_t);

    ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
    ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
    // Apply the gated delta rule for the single timestep
    // last_recurrent_state = last_recurrent_state * g_t
    state = ggml_mul(ctx0, state, g_t);

    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
    // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
    ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
    ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
    // we need to sum over dim=-2, so we transpose, sum, then transpose again
    kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));

    cb(k_beta, "k_beta", il);
    cb(v_beta, "v_beta", il);
    cb(g_cumsum, "g_cumsum", il);
    // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
    ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
    // delta = (v_t - kv_mem) * beta_t
    ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
    ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);

    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs]
    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs]
    // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
    ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
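    // One timestep of the gated delta rule in update form (S is the recurrent state):
    //   S      <- g_t * S
    //   kv_mem <- k_t^T S                (what S currently stores for key k_t)
    //   delta  <- beta_t * (v_t - kv_mem)
    //   S      <- S + k_t delta^T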

    // Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs]
    // ggml_tensor * gcs_i_broadcast =
    //     ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
    //         n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
    // Don't need this, this one will get auto-broadcast
    ggml_tensor * gcs_j_broadcast =
        ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]

    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);

    // Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
    // Apply exponential to get the decay mask values
    decay_mask = ggml_exp(ctx0, decay_mask);
    // Apply lower triangular mask again to ensure only lower triangular values remain
    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);

    cb(decay_mask, "decay_mask", il);

    // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);

    cb(kmulkbeta, "kmulkbeta", il);

    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));

    cb(attn, "attn_pre_rec", il);

    // for i in range(1, chunk_size):
    //     row = attn[..., i, :i].clone()
    //     sub = attn[..., :i, :i].clone()
    //     attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
    //
    // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);

    ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
    attn = ggml_mul(ctx0, lin_solve, causal_mask);
    attn = ggml_add(ctx0, attn, identity);

    // value = attn @ v_beta
    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);

    cb(v, "value_beta", il);

    // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
    ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
    ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);

    cb(gexp, "g_cum_exp", il);

    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);

    cb(kbeta_gexp, "kbeta_gexp", il);

    ggml_tensor * k_cumdecay =
        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));

    cb(k_cumdecay, "k_cumdecay", il);

    // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
    attn = ggml_mul_mat(ctx0, k, q);
    attn = ggml_mul(ctx0, attn, decay_mask);
    attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));

    cb(attn, "attn_decay_key", il);

    ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));

    // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
    ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);

    cb(v_prime, "v_prime", il);

    // v_new = v_i - v_prime
    ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);

    ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));

    cb(v_new, "v_new", il);

    // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
    ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
    ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);

    cb(attn_inter, "attn_inter", il);

    // core_attn_out[:, :, i] = attn_inter + attn @ v_new
    ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);

    cb(v_attn, "v_attn", il);

    ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);

    cb(core_attn_out, "core_attn_out", il);

    // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
    // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
    // key_gdiff = key * g_diff.unsqueeze(-1)
    // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
    // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew

    ggml_tensor * g_cum_last =
        ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
            g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
            g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));

    cb(g_cum_last, "g_cum_last", il);

    ggml_tensor * gexp_last =
        ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);

    cb(gexp_last, "gexp_last", il);

    ggml_tensor * g_cum_last_3d =
        ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);

    cb(g_cum_last_3d, "g_cum_last_3d", il);

    ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);

    cb(g_cumsum_3d, "g_cumsum_3d", il);

    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));

    cb(g_diff, "g_diff", il);

    ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);

    cb(g_diff_exp, "g_diff_exp", il);

    ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
        ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
            g_diff_exp->ne[2] * g_diff_exp->ne[3]));

    cb(key_gdiff, "key_gdiff", il);

    ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));

    cb(kgdmulvnew, "kgdmulvnew", il);

    state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
    // Compute the attention output
    // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
    ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
    ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
    // again, since it's over dim = -2, transpose, sum, transpose back
    ggml_tensor * core_attn_out =
        ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));

    // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
    cb(core_attn_out, "output_tokens", il);
    cb(state, "new_state", il);

    // flatten output
    ggml_tensor * flat_output =
        ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);

    ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
    // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
    ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
    ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);

    return ggml_concat(ctx0, flat_output, flat_state, 0);
}

@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    ggml_tensor * cur,
    ggml_tensor * causal_mask,
    ggml_tensor * identity,
    ggml_tensor * diag_mask,
    int il) {
    const auto * mctx_cur = inp->mctx;

@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    cb(mixed_ba, "linear_attn_mixed_ba", il);

    int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
    ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
    ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);

    // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
    int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
    ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
    ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);

    // Split mixed_ba into b and a (beta and alpha parameters)
    int64_t split_sizes_ba[2] = {

@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    ggml_tensor * beta  = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
    ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);

    GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));

    ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
    ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
    cb(alpha_softplus, "a_softplus", il);

@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
        (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
    cb(z, "z", il);

    GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
        ggml_nelements(mixed_qkvz));

    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
    ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);

@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    cb(k_conv, "k_conv_predelta", il);
    cb(v_conv, "v_conv_predelta", il);

    // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
    ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
        build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
        build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
    // Choose between build_delta_net_chunking and build_delta_net_autoregressive based on n_tokens
    ggml_tensor * attn_out;
    if (n_seq_tokens == 1) {
        attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
    } else {
        attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
    }
    cb(attn_out, "attn_out", il);

    // The tensors were concatenated 1d, so we need to extract them 1d as well

@ -0,0 +1,121 @@
#pragma once

#include "ggml.h"
#include "ggml-cpp.h"
#include "clip.h"
#include "clip-impl.h"
#include "clip-model.h"

#include <vector>
#include <functional>

#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)

struct clip_graph {
    const clip_model & model;
    const clip_hparams & hparams;
    projector_type proj_type;

    // we only support single image per batch
    const clip_image_f32 & img;

    const int patch_size;
    const int n_patches_x;
    const int n_patches_y;
    const int n_patches;
    const int n_embd;
    const int n_head;
const int d_head;
|
||||
const int n_layer;
|
||||
const int n_mmproj_embd;
|
||||
const float eps;
|
||||
const float kq_scale;
|
||||
const clip_flash_attn_type flash_attn_type;
|
||||
|
||||
// for debugging
|
||||
const bool debug_graph;
|
||||
std::vector<ggml_tensor *> & debug_print_tensors;
|
||||
|
||||
ggml_context_ptr ctx0_ptr;
|
||||
ggml_context * ctx0;
|
||||
ggml_cgraph * gf;
|
||||
|
||||
clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
|
||||
|
||||
virtual ~clip_graph() = default;
|
||||
virtual ggml_cgraph * build() = 0;
|
||||
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
void cb(ggml_tensor * cur0, const char * name, int il) const;
|
||||
|
||||
// siglip2 naflex
|
||||
ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
|
||||
|
||||
// build vision transformer (ViT) cgraph
|
||||
// this function should cover most of the models
|
||||
// if your model has specific features, you should probably duplicate this function
|
||||
ggml_tensor * build_vit(
|
||||
ggml_tensor * inp,
|
||||
int64_t n_pos,
|
||||
norm_type norm_t,
|
||||
ffn_op_type ffn_t,
|
||||
ggml_tensor * learned_pos_embd,
|
||||
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
|
||||
|
||||
// build the input after conv2d (inp_raw --> patches)
|
||||
// returns tensor with shape [n_embd, n_patches]
|
||||
ggml_tensor * build_inp();
|
||||
|
||||
ggml_tensor * build_inp_raw(int channels = 3);
|
||||
|
||||
ggml_tensor * build_norm(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * mw,
|
||||
ggml_tensor * mb,
|
||||
norm_type type,
|
||||
float norm_eps,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_ffn(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * up,
|
||||
ggml_tensor * up_b,
|
||||
ggml_tensor * gate,
|
||||
ggml_tensor * gate_b,
|
||||
ggml_tensor * down,
|
||||
ggml_tensor * down_b,
|
||||
ffn_op_type type_op,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
ggml_tensor * kq_mask,
|
||||
float kq_scale,
|
||||
int il) const;
|
||||
|
||||
// implementation of the 2D RoPE without adding a new op in ggml
|
||||
// this is not efficient (use double the memory), but works on all backends
|
||||
// TODO: there was a more efficient approach which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
|
||||
ggml_tensor * build_rope_2d(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * pos_a, // first half
|
||||
ggml_tensor * pos_b, // second half
|
||||
const float freq_base,
|
||||
const bool interleave_freq
|
||||
);
|
||||
|
||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
|
||||
|
||||
// Generic function to stack frames for audio processing
|
||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
|
||||
};
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
|
@ -13,6 +15,8 @@
|
|||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
|
|
@ -64,6 +68,7 @@
|
|||
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix, for backward compat
|
||||
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
||||
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
||||
#define TN_NORM_EMBD "v.norm_embd.%s"
|
||||
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
|
|
@ -82,6 +87,10 @@
|
|||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MM_UP "mm.up.%s"
|
||||
#define TN_MM_GATE "mm.gate.%s"
|
||||
#define TN_MM_DOWN "mm.down.%s"
|
||||
#define TN_MM_POST_NORM "mm.post_norm.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
|
|
@ -91,7 +100,7 @@
|
|||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||
|
|
@ -132,6 +141,10 @@
|
|||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
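Illustrative example (not part of the diff): CLIP_ALIGN(100, 32) evaluates to 128, while CLIP_ALIGN(64, 32) stays at 64.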
|
||||
|
||||
// forward declaration
|
||||
// TODO: improve this later
|
||||
struct clip_ctx;
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
|
|
@ -149,6 +162,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_GLMA,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
|
|
@ -156,6 +170,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
|
@ -175,6 +190,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_GLMA, "glma"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
|
|
@ -182,6 +198,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
|
@ -485,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
|
||||
|
||||
//
|
||||
// API used internally with mtmd
|
||||
//
|
||||
|
|
|
|||
|
|
@ -0,0 +1,300 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
|
||||
enum ffn_op_type {
|
||||
FFN_GELU,
|
||||
FFN_GELU_ERF,
|
||||
FFN_SILU,
|
||||
FFN_GELU_QUICK,
|
||||
};
|
||||
|
||||
enum norm_type {
|
||||
NORM_TYPE_NORMAL,
|
||||
NORM_TYPE_RMS,
|
||||
};
|
||||
|
||||
enum patch_merge_type {
|
||||
PATCH_MERGE_FLAT,
|
||||
PATCH_MERGE_SPATIAL_UNPAD,
|
||||
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
int32_t n_embd = 0;
|
||||
int32_t n_ff = 0;
|
||||
int32_t projection_dim = 0;
|
||||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
int32_t image_max_pixels = -1;
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
|
||||
// for models using dynamic image size, we need a smaller image size to warm up
|
||||
// otherwise, the user will get OOM every time they load the model
|
||||
int32_t warmup_image_size = 0;
|
||||
int32_t warmup_audio_size = 3000;
|
||||
|
||||
ffn_op_type ffn_op = FFN_GELU;
|
||||
|
||||
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
|
||||
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
|
||||
// audio-to-mel preprocessor params
|
||||
int32_t audio_chunk_len = -1; // in seconds
|
||||
int32_t audio_sample_rate = -1;
|
||||
int32_t audio_n_fft = -1;
|
||||
int32_t audio_window_len = -1;
|
||||
int32_t audio_hop_len = -1;
|
||||
|
||||
// legacy
|
||||
bool has_llava_projector = false;
|
||||
int minicpmv_version = 0;
|
||||
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
||||
|
||||
// custom value provided by user, can be undefined if not set
|
||||
int32_t custom_image_min_tokens = -1;
|
||||
int32_t custom_image_max_tokens = -1;
|
||||
|
||||
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
|
||||
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
|
||||
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
|
||||
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
|
||||
}
|
||||
|
||||
void set_warmup_n_tokens(int n_tokens) {
|
||||
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
|
||||
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
|
||||
// TODO: support warmup size for custom token numbers
|
||||
}
|
||||
};
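Worked example (values are illustrative, not from the diff): with patch_size = 14 and n_merge = 2, the per-token patch area is 14 * 14 * 2 * 2 = 784 pixels, so set_limit_image_tokens(1, 256) gives image_max_pixels = 256 * 784 = 200704 and warmup_image_size = sqrt(200704) = 448.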
|
||||
|
||||
struct clip_layer {
|
||||
// attention
|
||||
ggml_tensor * k_w = nullptr;
|
||||
ggml_tensor * k_b = nullptr;
|
||||
ggml_tensor * q_w = nullptr;
|
||||
ggml_tensor * q_b = nullptr;
|
||||
ggml_tensor * v_w = nullptr;
|
||||
ggml_tensor * v_b = nullptr;
|
||||
ggml_tensor * qkv_w = nullptr;
|
||||
ggml_tensor * qkv_b = nullptr;
|
||||
|
||||
ggml_tensor * o_w = nullptr;
|
||||
ggml_tensor * o_b = nullptr;
|
||||
|
||||
ggml_tensor * k_norm = nullptr;
|
||||
ggml_tensor * q_norm = nullptr;
|
||||
|
||||
// layernorm 1
|
||||
ggml_tensor * ln_1_w = nullptr;
|
||||
ggml_tensor * ln_1_b = nullptr;
|
||||
|
||||
ggml_tensor * ff_up_w = nullptr;
|
||||
ggml_tensor * ff_up_b = nullptr;
|
||||
ggml_tensor * ff_gate_w = nullptr;
|
||||
ggml_tensor * ff_gate_b = nullptr;
|
||||
ggml_tensor * ff_down_w = nullptr;
|
||||
ggml_tensor * ff_down_b = nullptr;
|
||||
|
||||
// layernorm 2
|
||||
ggml_tensor * ln_2_w = nullptr;
|
||||
ggml_tensor * ln_2_b = nullptr;
|
||||
|
||||
// layer scale (no bias)
|
||||
ggml_tensor * ls_1_w = nullptr;
|
||||
ggml_tensor * ls_2_w = nullptr;
|
||||
|
||||
// qwen3vl deepstack merger
|
||||
ggml_tensor * deepstack_norm_w = nullptr;
|
||||
ggml_tensor * deepstack_norm_b = nullptr;
|
||||
ggml_tensor * deepstack_fc1_w = nullptr;
|
||||
ggml_tensor * deepstack_fc1_b = nullptr;
|
||||
ggml_tensor * deepstack_fc2_w = nullptr;
|
||||
ggml_tensor * deepstack_fc2_b = nullptr;
|
||||
|
||||
bool has_deepstack() const {
|
||||
return deepstack_fc1_w != nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
clip_hparams hparams;
|
||||
|
||||
// embeddings
|
||||
ggml_tensor * class_embedding = nullptr;
|
||||
ggml_tensor * patch_embeddings_0 = nullptr;
|
||||
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
|
||||
ggml_tensor * patch_bias = nullptr;
|
||||
ggml_tensor * position_embeddings = nullptr;
|
||||
ggml_tensor * norm_embd_w = nullptr;
|
||||
ggml_tensor * norm_embd_b = nullptr;
|
||||
|
||||
ggml_tensor * pre_ln_w = nullptr;
|
||||
ggml_tensor * pre_ln_b = nullptr;
|
||||
|
||||
std::vector<clip_layer> layers;
|
||||
|
||||
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
|
||||
|
||||
ggml_tensor * post_ln_w;
|
||||
ggml_tensor * post_ln_b;
|
||||
|
||||
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
||||
ggml_tensor * mm_fc_w;
|
||||
ggml_tensor * mm_fc_b;
|
||||
ggml_tensor * mm_ffn_up_w = nullptr;
|
||||
ggml_tensor * mm_ffn_up_b = nullptr;
|
||||
ggml_tensor * mm_ffn_gate_w = nullptr;
|
||||
ggml_tensor * mm_ffn_gate_b = nullptr;
|
||||
ggml_tensor * mm_ffn_down_w = nullptr;
|
||||
ggml_tensor * mm_ffn_down_b = nullptr;
|
||||
ggml_tensor * mm_post_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_norm_b = nullptr;
|
||||
|
||||
// LLaVA projection
|
||||
ggml_tensor * mm_input_norm_w = nullptr;
|
||||
ggml_tensor * mm_input_norm_b = nullptr;
|
||||
ggml_tensor * mm_0_w = nullptr;
|
||||
ggml_tensor * mm_0_b = nullptr;
|
||||
ggml_tensor * mm_2_w = nullptr;
|
||||
ggml_tensor * mm_2_b = nullptr;
|
||||
|
||||
ggml_tensor * image_newline = nullptr;
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
|
||||
ggml_tensor * mm_1_b = nullptr;
|
||||
ggml_tensor * mm_3_w = nullptr;
|
||||
ggml_tensor * mm_3_b = nullptr;
|
||||
ggml_tensor * mm_4_w = nullptr;
|
||||
ggml_tensor * mm_4_b = nullptr;
|
||||
|
||||
// GLMV-Edge projection
|
||||
ggml_tensor * mm_model_adapter_conv_w = nullptr;
|
||||
ggml_tensor * mm_model_adapter_conv_b = nullptr;
|
||||
|
||||
// MobileVLM projection
|
||||
ggml_tensor * mm_model_mlp_1_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_1_b = nullptr;
|
||||
ggml_tensor * mm_model_mlp_3_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_3_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
|
||||
|
||||
// MobileVLM_V2 projection
|
||||
ggml_tensor * mm_model_mlp_0_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_0_b = nullptr;
|
||||
ggml_tensor * mm_model_mlp_2_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_2_b = nullptr;
|
||||
ggml_tensor * mm_model_peg_0_w = nullptr;
|
||||
ggml_tensor * mm_model_peg_0_b = nullptr;
|
||||
|
||||
// MINICPMV projection
|
||||
ggml_tensor * mm_model_pos_embed_k = nullptr;
|
||||
ggml_tensor * mm_model_query = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_kv_proj = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_k_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_k_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_v_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_v_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_o_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_o_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_q_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_q_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_kv_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_kv_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_post_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_post_b = nullptr;
|
||||
|
||||
// gemma3
|
||||
ggml_tensor * mm_input_proj_w = nullptr;
|
||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||
ggml_tensor * mm_patch_merger_b = nullptr;
|
||||
|
||||
// ultravox / whisper encoder
|
||||
ggml_tensor * conv1d_1_w = nullptr;
|
||||
ggml_tensor * conv1d_1_b = nullptr;
|
||||
ggml_tensor * conv1d_2_w = nullptr;
|
||||
ggml_tensor * conv1d_2_b = nullptr;
|
||||
ggml_tensor * mm_norm_pre_w = nullptr;
|
||||
ggml_tensor * mm_norm_pre_b = nullptr;
|
||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||
|
||||
// cogvlm
|
||||
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
||||
ggml_tensor * mm_h_to_4h_w = nullptr;
|
||||
ggml_tensor * mm_gate_w = nullptr;
|
||||
ggml_tensor * mm_4h_to_h_w = nullptr;
|
||||
ggml_tensor * mm_boi = nullptr;
|
||||
ggml_tensor * mm_eoi = nullptr;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool audio_has_stack_frames() const {
|
||||
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
};
|
||||
|
||||
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
|
||||
File diff suppressed because it is too large
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
// !!! Internal header, to be used by mtmd only !!!
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
struct clip_image_size {
|
||||
|
|
@ -102,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
|
|||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx);
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,98 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
GGML_ASSERT(model.class_embedding != nullptr);
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
|
||||
const int n_pos = n_patches + 1; // +1 for [CLS]
|
||||
|
||||
// build input and concatenate class embedding
|
||||
ggml_tensor * inp = build_inp();
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
|
||||
inp = ggml_add(ctx0, inp, model.position_embeddings);
|
||||
cb(inp, "inp_pos", -1);
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
auto & layer = model.layers[il];
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
|
||||
cur->nb[1], 0);
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
|
||||
cur->nb[1], n_embd * sizeof(float));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
|
||||
cur->nb[1], 2 * n_embd * sizeof(float));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
inpL = cur;
|
||||
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
layer.ff_gate_w, layer.ff_gate_b,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "layer_out", il);
|
||||
inpL = cur;
|
||||
|
||||
}
|
||||
|
||||
// remove CLS token (like build_llama4 does)
|
||||
ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
|
||||
n_embd, n_patches,
|
||||
ggml_row_size(inpL->type, n_embd), 0);
|
||||
|
||||
// Multiply with mm_model_proj
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
|
||||
// Apply layernorm, weight, bias
|
||||
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
|
||||
// Apply GELU
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
|
||||
// Branch 1: multiply with mm_h_to_4h_w
|
||||
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
|
||||
|
||||
// Branch 2: multiply with mm_gate_w
|
||||
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
|
||||
|
||||
// Apply silu
|
||||
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
|
||||
|
||||
// Apply mm_4h_to_h_w
|
||||
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
|
||||
|
||||
// Concatenate with boi and eoi
|
||||
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
|
||||
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_glm4v::build() {
|
||||
GGML_ASSERT(model.patch_bias != nullptr);
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
GGML_ASSERT(model.class_embedding == nullptr);
|
||||
|
||||
const int batch_size = 1;
|
||||
|
||||
norm_type norm_t = NORM_TYPE_RMS;
|
||||
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
|
||||
// second conv dimension
|
||||
{
|
||||
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, inp_1);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||
inp = ggml_reshape_4d(
|
||||
ctx0, inp,
|
||||
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||
inp = ggml_cont_3d(
|
||||
ctx0, inp,
|
||||
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
|
||||
// add patch bias
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
cb(inp, "patch_bias", -1);
|
||||
|
||||
// pos-conv norm
|
||||
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
|
||||
|
||||
// calculate absolute position embedding and apply
|
||||
ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
|
||||
learned_pos_embd = ggml_cont_4d(
|
||||
ctx0, learned_pos_embd,
|
||||
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||
learned_pos_embd = ggml_reshape_4d(
|
||||
ctx0, learned_pos_embd,
|
||||
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
|
||||
learned_pos_embd = ggml_cont_3d(
|
||||
ctx0, learned_pos_embd,
|
||||
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
cb(learned_pos_embd, "learned_pos_embd", -1);
|
||||
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
return ggml_rope_multi(
|
||||
ctx0, cur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
|
||||
32768, hparams.rope_theta, 1, 0, 1, 32, 1);
|
||||
};
|
||||
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
norm_t,
|
||||
hparams.ffn_op,
|
||||
learned_pos_embd,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
// cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
|
||||
|
||||
// GLM4V projector
|
||||
// ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
|
||||
|
||||
// patch merger (downsample)
|
||||
{
|
||||
int n_merge = hparams.n_merge;
|
||||
GGML_ASSERT(n_merge > 0);
|
||||
|
||||
int n_token_out = n_patches / n_merge / n_merge;
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
|
||||
cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
|
||||
}
|
||||
|
||||
// FC projector
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
// default LayerNorm (post_projection_norm)
|
||||
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
cb(cur, "after_fc_proj", -1);
|
||||
}
|
||||
|
||||
// FFN projector
|
||||
{
|
||||
cur = build_ffn(cur,
|
||||
model.mm_ffn_up_w, model.mm_ffn_up_b,
|
||||
model.mm_ffn_gate_w, model.mm_ffn_gate_b,
|
||||
model.mm_ffn_down_w, model.mm_ffn_down_b,
|
||||
hparams.ffn_op, -1);
|
||||
cb(cur, "after_ffn_proj", -1);
|
||||
// cb(ggml_sum(ctx0, cur), "merged_sum", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_internvl::build() {
|
||||
GGML_ASSERT(model.class_embedding != nullptr);
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
|
||||
const int n_pos = n_patches + 1;
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
// add CLS token
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
|
||||
// The larger models use a different ViT, which uses RMS norm instead of layer norm
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
|
||||
norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
|
||||
? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
|
||||
: NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
|
||||
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_pos,
|
||||
norm_t,
|
||||
hparams.ffn_op,
|
||||
model.position_embeddings,
|
||||
nullptr);
|
||||
|
||||
// remove CLS token
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
n_embd, n_patches,
|
||||
ggml_row_size(cur->type, n_embd), 0);
|
||||
|
||||
// pixel shuffle
|
||||
{
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
||||
const int height = n_patches_y;
|
||||
const int width = n_patches_x;
|
||||
GGML_ASSERT(scale_factor > 0);
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_cont_4d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
height / scale_factor,
|
||||
width / scale_factor,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
// flatten to 2D
|
||||
cur = ggml_cont_2d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
cur->ne[1] * cur->ne[2]);
|
||||
}
|
||||
|
||||
// projector (always using GELU activation)
|
||||
{
|
||||
// projector LayerNorm uses pytorch's default eps = 1e-5
|
||||
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
|
||||
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_3_w, model.mm_3_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_kimivl::build() {
|
||||
// 2D input positions
|
||||
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_h, "pos_h");
|
||||
ggml_set_input(pos_h);
|
||||
|
||||
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_w, "pos_w");
|
||||
ggml_set_input(pos_w);
|
||||
|
||||
ggml_tensor * learned_pos_embd = resize_position_embeddings();
|
||||
|
||||
// build ViT with 2D position embeddings
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
// first half is X axis and second half is Y axis
|
||||
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
|
||||
};
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
learned_pos_embd,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
|
||||
{
|
||||
// patch_merger
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
|
||||
// projection norm
|
||||
int proj_inp_dim = cur->ne[0];
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
n_embd, cur->ne[1] * scale_factor * scale_factor,
|
||||
ggml_row_size(cur->type, n_embd), 0);
|
||||
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
|
||||
ggml_row_size(cur->type, proj_inp_dim), 0);
|
||||
cb(cur, "proj_inp_normed", -1);
|
||||
|
||||
// projection mlp
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
cb(cur, "proj_out", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -0,0 +1,96 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_llama4::build() {
|
||||
GGML_ASSERT(model.class_embedding != nullptr);
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
|
||||
const int n_pos = n_patches + 1; // +1 for [CLS]
|
||||
|
||||
// 2D input positions
|
||||
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
|
||||
ggml_set_name(pos_h, "pos_h");
|
||||
ggml_set_input(pos_h);
|
||||
|
||||
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
|
||||
ggml_set_name(pos_w, "pos_w");
|
||||
ggml_set_input(pos_w);
|
||||
|
||||
ggml_tensor * inp = build_inp_raw();
|
||||
|
||||
// Llama4UnfoldConvolution
|
||||
{
|
||||
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
|
||||
patch_size, patch_size, 3, n_embd);
|
||||
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
|
||||
cb(inp, "patch_conv", -1);
|
||||
}
|
||||
|
||||
// add CLS token
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
|
||||
// build ViT with 2D position embeddings
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
// first half is X axis and second half is Y axis
|
||||
// ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
|
||||
// ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
|
||||
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
|
||||
};
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_pos,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
model.position_embeddings,
|
||||
add_pos);
|
||||
|
||||
// remove CLS token
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
n_embd, n_patches,
|
||||
ggml_row_size(cur->type, n_embd), 0);
|
||||
|
||||
// pixel shuffle
|
||||
// based on Llama4VisionPixelShuffleMLP
|
||||
// https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
|
||||
{
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
||||
GGML_ASSERT(scale_factor > 0);
|
||||
GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
|
||||
cur = ggml_reshape_4d(ctx0, cur,
|
||||
n_embd * scale_factor,
|
||||
n_patches_x / scale_factor,
|
||||
n_patches_y,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_cont_4d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
n_patches_x / scale_factor,
|
||||
n_patches_y / scale_factor,
|
||||
bsz);
|
||||
//cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
// flatten to 2D
|
||||
cur = ggml_cont_2d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
n_patches / scale_factor / scale_factor);
|
||||
cb(cur, "pixel_shuffle", -1);
|
||||
}
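Reading off the final ggml_cont_2d above, the pixel shuffle trades resolution for width: with scale_factor = 2 the token grid shrinks from n_patches to n_patches / 4 while each token widens from n_embd to 4 * n_embd channels (shapes derived from the code above, shown only for illustration).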
|
||||
|
||||
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "adapter_mlp", -1);
|
||||
}
|
||||
|
||||
// Llama4MultiModalProjector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
cb(cur, "projected", -1);
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -0,0 +1,374 @@
|
|||
#include "models.h"
|
||||
|
||||
// this graph is used by llava, granite and glm
|
||||
// due to having embedding_stack (used by granite), we cannot reuse build_vit
|
||||
ggml_cgraph * clip_graph_llava::build() {
|
||||
const int batch_size = 1;
|
||||
const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
|
||||
|
||||
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
|
||||
|
||||
// Calculate the deepest feature layer based on hparams and projector type
|
||||
int max_feature_layer = n_layer;
|
||||
{
|
||||
// Get the index of the second to last layer; this is the default for models that have a llava projector
|
||||
int il_last = hparams.n_layer - 1;
|
||||
int deepest_feature_layer = -1;
|
||||
|
||||
if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
il_last += 1;
|
||||
}
|
||||
|
||||
// If we set explicit vision feature layers, only go up to the deepest one
|
||||
// NOTE: only used by granite-vision models for now
|
||||
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
||||
if (feature_layer > deepest_feature_layer) {
|
||||
deepest_feature_layer = feature_layer;
|
||||
}
|
||||
}
|
||||
max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
|
||||
}
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
if (model.class_embedding) {
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
}
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
|
||||
// pre-layernorm
|
||||
if (model.pre_ln_w) {
|
||||
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
cb(inpL, "pre_ln", -1);
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor *> embedding_stack;
|
||||
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < max_feature_layer; il++) {
|
||||
auto & layer = model.layers[il];
|
||||
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(cur);
|
||||
}
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "layer_inp_normed", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
// re-add the layer input, i.e., the residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
inpL = cur; // inpL = residual, cur = hidden_states
|
||||
|
||||
cb(cur, "ffn_inp", il);
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "ffn_inp_normed", il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
layer.ff_gate_w, layer.ff_gate_b,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
cb(cur, "layer_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (model.post_ln_w) {
|
||||
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
}
|
||||
|
||||
ggml_tensor * embeddings = inpL;
|
||||
|
||||
// process vision feature layers (used by granite)
|
||||
{
|
||||
// final layer is a vision feature layer
|
||||
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(inpL);
|
||||
}
|
||||
|
||||
// If feature layers are explicitly set, stack them (if we have multiple)
|
||||
if (!embedding_stack.empty()) {
|
||||
embeddings = embedding_stack[0];
|
||||
for (size_t i = 1; i < embedding_stack.size(); i++) {
|
||||
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// llava projector (also used by granite)
|
||||
if (hparams.has_llava_projector) {
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(patches, "patches");
|
||||
ggml_set_input(patches);
|
||||
|
||||
// shape [1, 576, 1024]
|
||||
// ne is whcn, ne = [1024, 576, 1, 1]
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// print_tensor_info(embeddings, "embeddings");
|
||||
|
||||
// llava projector
|
||||
if (proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
if (model.mm_2_w) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
|
||||
model.mm_1_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
|
||||
model.mm_4_b);
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
// block 1
|
||||
ggml_tensor * block_1 = nullptr;
|
||||
{
|
||||
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
|
||||
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
// stride = 1, padding = 1, bias is nullptr
|
||||
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
// layer norm
|
||||
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
|
||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// hardswish
|
||||
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// residual
|
||||
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
||||
}
|
||||
|
||||
// block_2
|
||||
{
|
||||
// stride = 2
|
||||
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
|
||||
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// layer norm
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// hardswish
|
||||
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
// not sure the parameters are right for globalAvgPooling
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
||||
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
||||
}
|
||||
embeddings = block_1;
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
|
||||
mlp_0 = ggml_gelu(ctx0, mlp_0);
|
||||
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
|
||||
// mlp_2 ne = [576, 2048, 1, 1]
|
||||
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
|
||||
// mlp_2 ne [24, 24, 2048, 1]
|
||||
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
||||
// weight ne = [3, 3, 2048, 1]
|
||||
ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
|
||||
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
|
||||
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
|
||||
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
|
||||
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
|
||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||
embeddings = peg_0;
|
||||
}
|
||||
else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// glm projector
|
||||
else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
||||
embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
|
||||
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
||||
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
||||
// GLU
|
||||
{
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
||||
ggml_tensor * x = embeddings;
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
||||
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
|
||||
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
|
||||
{
|
||||
embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
|
||||
embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
GGML_ABORT("llava: unknown projector type");
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@@ -0,0 +1,114 @@
#include "models.h"

ggml_cgraph * clip_graph_minicpmv::build() {
    GGML_ASSERT(model.class_embedding == nullptr);
    const int n_pos = n_patches;
    const int n_embd_proj = n_mmproj_embd;

    // position embeddings for the projector (not for ViT)
    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
    // base frequency omega
    ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
    ggml_set_name(omega, "omega");
    ggml_set_input(omega);

    // 2D input positions (using float for sinusoidal embeddings)
    ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);
    ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    // for selecting learned pos embd, used by ViT
    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    ggml_tensor * inp = build_inp();
    ggml_tensor * embeddings = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        nullptr);

    // resampler projector (it is just another transformer)

    ggml_tensor * q = model.mm_model_query;
    ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

    // norm
    q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

    // calculate sinusoidal pos embd
    ggml_tensor * pos_embed = nullptr;
    {
        // outer product
        ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
        ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
        ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
        // sin and cos
        ggml_tensor * pos_embd_x = ggml_concat(
            ctx0,
            ggml_sin(ctx0, theta_x),
            ggml_cos(ctx0, theta_x),
            0 // concat on first dim
        );
        ggml_tensor * pos_embd_y = ggml_concat(
            ctx0,
            ggml_sin(ctx0, theta_y),
            ggml_cos(ctx0, theta_y),
            0 // concat on first dim
        );
        pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
    }

    // k = v + pos_embed
    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

    // attention
    {
        const int d_head = 128;
        int n_head = n_embd_proj / d_head;
        // use actual config value if available, otherwise fall back to hardcoded values
        int num_query = hparams.minicpmv_query_num;
        ggml_tensor * Q = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
            model.mm_model_attn_q_b);
        ggml_tensor * K = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
            model.mm_model_attn_k_b);
        ggml_tensor * V = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
            model.mm_model_attn_v_b);

        Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
        K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
        V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);

        cb(Q, "resampler_Q", -1);
        cb(K, "resampler_K", -1);
        cb(V, "resampler_V", -1);

        float resampler_kq_scale = 1.0f / sqrtf(float(d_head));
        embeddings = build_attn(
            model.mm_model_attn_o_w,
            model.mm_model_attn_o_b,
            Q, K, V, nullptr, resampler_kq_scale, -1);
        cb(embeddings, "resampler_attn_out", -1);
    }
    // layernorm
    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);

    // projection
    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
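A minimal CPU sketch of the 2D sincos embedding assembled above. The [sin_x | cos_x | sin_y | cos_y] quarter layout follows the three ggml_concat calls on dim 0; the omega = 1/10000^(i/(d/4)) frequency formula is an assumption (the real values are filled into the "omega" input at runtime):

#include <cmath>
#include <vector>

static std::vector<float> sincos_pos_embd_2d(int n_embd_proj, int n_pos,
                                             const std::vector<float> & pos_w,
                                             const std::vector<float> & pos_h) {
    const int quarter = n_embd_proj / 4; // assumes n_embd_proj divisible by 4
    std::vector<float> out((size_t) n_pos * n_embd_proj);
    for (int p = 0; p < n_pos; p++) {
        float * row = out.data() + (size_t) p * n_embd_proj;
        for (int i = 0; i < quarter; i++) {
            // conventional base frequency; stands in for the "omega" input tensor
            const float omega = 1.0f / std::pow(10000.0f, (float) i / quarter);
            const float tx = pos_w[p] * omega; // theta_x in the graph
            const float ty = pos_h[p] * omega; // theta_y in the graph
            row[              i] = std::sin(tx);
            row[    quarter + i] = std::cos(tx);
            row[2 * quarter + i] = std::sin(ty);
            row[3 * quarter + i] = std::cos(ty);
        }
    }
    return out;
}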
@@ -0,0 +1,6 @@
package models

// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/../../../include -I${SRCDIR}/../../../common -I${SRCDIR}/../../../vendor
// #cgo CPPFLAGS: -I${SRCDIR}/../../../../../ml/backend/ggml/ggml/include
import "C"
@@ -0,0 +1,63 @@
#pragma once

#include "../clip-graph.h"

struct clip_graph_siglip : clip_graph {
    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_pixtral : clip_graph {
    clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_qwen2vl : clip_graph {
    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_qwen3vl : clip_graph {
    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_internvl : clip_graph {
    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_llama4 : clip_graph {
    clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_kimivl : clip_graph {
    clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_cogvlm : clip_graph {
    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_llava : clip_graph {
    clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_whisper_enc : clip_graph {
    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};
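Each struct above is a one-shot builder: construct it from a clip context plus a preprocessed image, then call build(). A hypothetical dispatch sketch (the real selection logic lives in clip.cpp and is not part of this diff; only the enum values used elsewhere in these files are referenced):

// illustrative only -- not the actual dispatch in clip.cpp
static ggml_cgraph * build_graph_for(clip_ctx * ctx, const clip_image_f32 & img,
                                     projector_type proj) {
    switch (proj) {
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_LFM2:
            return clip_graph_siglip(ctx, img).build();  // SigLIP-style ViT + projector
        case PROJECTOR_TYPE_QWEN25VL:
            return clip_graph_qwen2vl(ctx, img).build(); // qwen2vl/qwen2.5vl share one graph
        default:
            return nullptr; // remaining projector types map onto the other structs above
    }
}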
@@ -0,0 +1,86 @@
#include "models.h"

ggml_cgraph * clip_graph_pixtral::build() {
    const int n_merge = hparams.n_merge;

    // 2D input positions
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // no learned pos embd
        add_pos);

    // mistral small 3.1 patch merger
    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
    if (model.mm_patch_merger_w) {
        GGML_ASSERT(hparams.n_merge > 0);

        cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);

        // reshape image tokens to 2D grid
        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
        cur = ggml_cont(ctx0, cur);

        // torch.nn.functional.unfold is just an im2col under the hood
        // we just need a dummy kernel to make it work
        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);

        // project to n_embd
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
    }

    // LlavaMultiModalProjector (always using GELU activation)
    {
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);
    }

    // arrangement of the [IMG_BREAK] token
    if (model.token_embd_img_break) {
        // not efficient, but works
        // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]

        const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
        const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
        const int p_total = p_x * p_y;
        const int n_embd_text = cur->ne[0];
        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row

        ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
        ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
        tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
        tok = ggml_add(ctx0, tok, model.token_embd_img_break);
        tmp = ggml_concat(ctx0, tmp, tok, 1);
        cur = ggml_view_2d(ctx0, tmp,
            n_embd_text, n_tokens_output,
            ggml_row_size(tmp->type, n_embd_text), 0);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
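The [IMG_BREAK] arrangement is easiest to sanity-check with plain index arithmetic; a self-contained sketch (hypothetical grid sizes, not part of the vendored file):

#include <cassert>

// Tokens after appending one [IMG_BREAK] per row and cropping the final one
// via the 2D view: a 3x2 merged grid yields 3+1 + 3 = 7 tokens.
static int n_tokens_with_img_break(int p_x, int p_y) {
    const int p_total = p_x * p_y;
    return p_total + p_y - 1;
}

int main() {
    assert(n_tokens_with_img_break(3, 2) == 7);
    assert(n_tokens_with_img_break(4, 4) == 19);
    return 0;
}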
@@ -0,0 +1,183 @@
#include "models.h"

ggml_cgraph * clip_graph_qwen2vl::build() {
    GGML_ASSERT(model.patch_bias == nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;
    const bool use_window_attn = hparams.n_wa_pattern > 0;
    const int n_wa_pattern = hparams.n_wa_pattern;
    const int n_pos = n_patches;
    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
        ? NORM_TYPE_RMS    // qwen 2.5 vl
        : NORM_TYPE_NORMAL; // qwen 2 vl

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    ggml_tensor * inpL = inp;
    ggml_tensor * window_mask = nullptr;
    ggml_tensor * window_idx = nullptr;
    ggml_tensor * inv_window_idx = nullptr;

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
    }

    if (use_window_attn) {
        // handle window attention inputs
        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(inv_window_idx, "inv_window_idx");
        ggml_set_input(inv_window_idx);
        // mask for window attention
        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
        ggml_set_name(window_mask, "window_mask");
        ggml_set_input(window_mask);

        // if flash attn is used, we need to pad the mask and cast to f16
        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
        }

        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
        GGML_ASSERT(batch_size == 1);
        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;

        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(cur, "ln1", il);

        // self-attention
        {
            ggml_tensor * Qcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
            ggml_tensor * Kcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
            ggml_tensor * Vcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);

            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // apply M-RoPE
            Qcur = ggml_rope_multi(
                ctx0, Qcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
            Kcur = ggml_rope_multi(
                ctx0, Kcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);

            cb(Qcur, "Qcur_rope", il);
            cb(Kcur, "Kcur_rope", il);

            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;

            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, i.e., the residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
    }

    // multimodal projection
    ggml_tensor * embeddings = inpL;
    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
    embeddings = build_ffn(embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        FFN_GELU,
        -1);

    if (use_window_attn) {
        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(window_idx, "window_idx");
        ggml_set_input(window_idx);

        // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
        GGML_ASSERT(batch_size == 1);
        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
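Each vision token carries four M-RoPE position components, which is why num_position_ids is n_pos * 4 and mrope_sections splits d_head into four equal parts. A sketch of how such a position buffer could be filled for a single image's patch grid; the plane ordering is an assumption, only the 4 * n_pos shape is taken from the graph above:

#include <cstdint>
#include <vector>

static std::vector<int32_t> mrope_positions(int n_x, int n_y) {
    const int n_pos = n_x * n_y;
    std::vector<int32_t> pos(4 * n_pos);
    for (int y = 0; y < n_y; y++) {
        for (int x = 0; x < n_x; x++) {
            const int i = y * n_x + x;
            pos[0 * n_pos + i] = 0; // temporal index (single image)
            pos[1 * n_pos + i] = y; // height index
            pos[2 * n_pos + i] = x; // width index
            pos[3 * n_pos + i] = 0; // spare section
        }
    }
    return pos;
}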
@@ -0,0 +1,191 @@
#include "models.h"

ggml_cgraph * clip_graph_qwen3vl::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;
    const int n_pos = n_patches;
    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    norm_type norm_t = NORM_TYPE_NORMAL;

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // add patch bias
    if (model.patch_bias != nullptr) {
        inp = ggml_add(ctx0, inp, model.patch_bias);
        cb(inp, "patch_bias", -1);
    }

    // calculate absolute position embedding and apply
    ggml_tensor * learned_pos_embd = resize_position_embeddings();
    learned_pos_embd = ggml_cont_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
    learned_pos_embd = ggml_reshape_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
    learned_pos_embd = ggml_cont_3d(
        ctx0, learned_pos_embd,
        n_embd, n_patches_x * n_patches_y, batch_size);
    inp = ggml_add(ctx0, inp, learned_pos_embd);
    cb(inp, "inp_pos_emb", -1);

    ggml_tensor * inpL = inp;

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
    }

    // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
    ggml_tensor * deepstack_features = nullptr;
    const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];

        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(cur, "ln1", il);

        // self-attention
        {
            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
            cur = ggml_add(ctx0, cur, layer.qkv_b);

            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ 0);

            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ ggml_row_size(cur->type, n_embd));

            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ ggml_row_size(cur->type, 2 * n_embd));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // apply M-RoPE
            Qcur = ggml_rope_multi(
                ctx0, Qcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
            Kcur = ggml_rope_multi(
                ctx0, Kcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);

            cb(Qcur, "Qcur_rope", il);
            cb(Kcur, "Kcur_rope", il);

            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, i.e., the residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        if (layer.has_deepstack()) {
            ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
            feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
            feat = build_ffn(feat,
                layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                nullptr, nullptr,
                layer.deepstack_fc2_w, layer.deepstack_fc2_b,
                ffn_op_type::FFN_GELU, il);

            if (!deepstack_features) {
                deepstack_features = feat;
            } else {
                // concat along the feature dimension
                deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
            }
        }

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
    }

    // multimodal projection
    ggml_tensor * embeddings = inpL;
    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);

    embeddings = build_ffn(embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        ffn_op_type::FFN_GELU, -1);

    embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
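Every deepstack layer contributes one more projector-width slice on dim 0, so the final concat grows the token width linearly. A sketch of the resulting width, assuming each deepstack FFN projects to the same width as the main multimodal projector (which the concat on dim 0 implies):

// sketch only: final per-token width after the deepstack concat above
static int64_t qwen3vl_out_width(int64_t n_proj_embd, int n_deepstack_layers) {
    // main projector output plus one slice per layer with has_deepstack()
    return n_proj_embd * (1 + n_deepstack_layers);
}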
@@ -0,0 +1,81 @@
#include "models.h"

ggml_cgraph * clip_graph_siglip::build() {
    ggml_tensor * inp = build_inp();

    ggml_tensor * learned_pos_embd = model.position_embeddings;
    if (proj_type == PROJECTOR_TYPE_LFM2) {
        learned_pos_embd = resize_position_embeddings();
    }

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        nullptr);

    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
        const int batch_size = 1;
        GGML_ASSERT(n_patches_x == n_patches_y);
        const int patches_per_image = n_patches_x;
        const int kernel_size = hparams.n_merge;

        cur = ggml_transpose(ctx0, cur);
        cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);

        // doing a pool2d to reduce the number of output tokens
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

        // apply norm before projection
        cur = ggml_rms_norm(ctx0, cur, eps);
        cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);

        // apply projection
        cur = ggml_mul_mat(ctx0,
            ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
            cur);

    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
        // pixel_shuffle
        // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);
        cur = ggml_mul_mat(ctx0, model.projection, cur);

    } else if (proj_type == PROJECTOR_TYPE_LFM2) {
        // pixel unshuffle block
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

        // projection
        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);

    } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
        cur = build_ffn(cur,
            model.mm_0_w, model.mm_0_b,
            nullptr, nullptr,
            model.mm_1_w, model.mm_1_b,
            hparams.ffn_op,
            -1);

    } else {
        GGML_ABORT("SigLIP: Unsupported projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
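The Gemma3 branch trades tokens for none of the embedding width: ggml_pool_2d with stride equal to the kernel shrinks each grid side by kernel_size, so the token count drops quadratically. A quick arithmetic sketch (example numbers are illustrative):

// sketch: token count after the Gemma3 average pooling above,
// e.g. 64x64 patches with kernel_size 4 -> 16x16 = 256 output tokens
static int gemma3_pooled_tokens(int patches_per_image, int kernel_size) {
    const int side = patches_per_image / kernel_size; // stride == kernel
    return side * side;
}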
@@ -0,0 +1,106 @@
#include "models.h"

ggml_cgraph * clip_graph_whisper_enc::build() {
    const int n_frames = img.nx;
    const int n_pos = n_frames / 2;
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    ggml_tensor * inp = build_inp_raw(1);

    // conv1d block
    {
        // convolution + gelu
        ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_1_b);

        cur = ggml_gelu_erf(ctx0, cur);

        cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_2_b);

        cur = ggml_gelu_erf(ctx0, cur);
        // transpose
        inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cb(inp, "after_conv1d", -1);
    }

    // sanity check (only check one layer, but it should be the same for all)
    GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
    GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
    GGML_ASSERT(model.layers[0].q_b);
    GGML_ASSERT(model.layers[0].v_b);
    GGML_ASSERT(!model.layers[0].k_b); // no bias for k

    ggml_tensor * pos_embd_selected = ggml_view_2d(
        ctx0, model.position_embeddings,
        model.position_embeddings->ne[0], n_pos,
        model.position_embeddings->nb[1], 0
    );
    ggml_tensor * cur = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_embd_selected,
        nullptr);

    cb(cur, "after_transformer", -1);

    if (model.audio_has_stack_frames()) {
        // StackAudioFrames
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
        cb(cur, "after_stacked", -1);
    }

    if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
        // UltravoxProjector
        // pre-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

        // ffn in
        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

        // swiglu
        // see SwiGLU in ultravox_model.py: the second half is passed through silu, not the first half
        cur = ggml_swiglu_swapped(ctx0, cur);

        // mid-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);

        // ffn out
        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);

    } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
        // projector
        cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
        cur = ggml_add(ctx0, cur, model.mm_fc_b);

    } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
        // projector
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU_ERF,
            -1);

    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
        cur = ggml_norm(ctx0, cur, hparams.eps);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
        cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
        cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
        cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
        cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
    } else {
        GGML_ABORT("%s: unknown projector type", __func__);
    }

    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
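Audio token bookkeeping for this encoder: the second conv1d has stride 2, so the transformer sees n_pos = n_frames / 2 positions, and StackAudioFrames then fuses proj_stack_factor consecutive frames into one wider vector. A sketch of the resulting token count (the round-up assumes frames are padded to a multiple of the stack factor, as the Ultravox reference does):

// sketch only: audio tokens after the conv stride and frame stacking above
static int audio_tokens_after_stack(int n_frames, int stack_factor) {
    const int n_pos = n_frames / 2;                    // stride-2 conv1d
    return (n_pos + stack_factor - 1) / stack_factor;  // assumed padding to a multiple
}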
File diff suppressed because it is too large
@@ -1,23 +1,15 @@
#pragma once

#include "ggml.h"
#include "clip-model.h"

#include <cstdint>
#include <vector>
#include <string>

#define WHISPER_ASSERT GGML_ASSERT
#define MTMD_INTERNAL_HEADER

#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT       400
#define WHISPER_HOP_LENGTH  160
#define WHISPER_CHUNK_SIZE  30

#define COMMON_SAMPLE_RATE 16000

namespace whisper_preprocessor {

struct whisper_mel {
struct mtmd_audio_mel {
    int n_len;
    int n_len_org;
    int n_mel;

@@ -25,23 +17,18 @@ struct whisper_mel {
    std::vector<float> data;
};

struct whisper_filters {
    int32_t n_mel;
    int32_t n_fft;
struct mtmd_audio_preprocessor {
    const clip_hparams & hparams;

    std::vector<float> data;
    mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}

    virtual ~mtmd_audio_preprocessor() = default;
    virtual void initialize() = 0; // NOT thread-safe
    virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
};

bool preprocess_audio(
        const float * samples,
        size_t n_samples,
        const whisper_filters & filters,
        std::vector<whisper_mel> & output);

} // namespace whisper_preprocessor

namespace whisper_precalc_filters {

whisper_preprocessor::whisper_filters get_128_bins();

} // namespace whisper_precalc_filters
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
};
@@ -32,6 +32,10 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"

#ifdef MTMD_INTERNAL_HEADER
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
#endif

//
// internal logging functions
//
@@ -161,8 +161,7 @@ struct mtmd_context {
    // string template for slice image delimiters with row/col (idefics3)
    std::string sli_img_start_tmpl;

    // for whisper, we pre-calculate the mel filter bank
    whisper_preprocessor::whisper_filters w_filters;
    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;

    // TODO @ngxson : add timings

@@ -228,7 +227,7 @@ struct mtmd_context {

    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
        use_mrope = clip_is_qwen2vl(ctx_v);
        use_mrope = clip_is_mrope(ctx_v);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);

@@ -320,6 +319,10 @@ struct mtmd_context {
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";

        } else if (proj == PROJECTOR_TYPE_GLM4V) {
            img_beg = "<|begin_of_image|>";
            img_end = "<|end_of_image|>";

        }
    }

@@ -327,14 +330,25 @@
        GGML_ASSERT(ctx_a != nullptr);
        projector_type proj = clip_get_projector_type(ctx_a);

        if (clip_has_whisper_encoder(ctx_a)) {
            // TODO @ngxson : check if model n_mel is 128 or 80
            w_filters = whisper_precalc_filters::get_128_bins();
        }

        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);

        // set preprocessor
        switch (proj) {
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_QWEN25O:
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
            default:
                GGML_ABORT("unsupported audio projector type");
        }

        // initialize audio preprocessor
        audio_preproc->initialize();

        // set special tokens
        if (proj == PROJECTOR_TYPE_QWEN2A) {
            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
            aud_beg = "<|audio_bos|>";
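The refactor above turns the hard-wired whisper mel pipeline into a virtual interface, so adding another audio frontend means subclassing and extending the switch. A hypothetical extension sketch (the subclass name and its body are illustrative, not part of the diff):

// hypothetical new preprocessor; only the interface shape is taken from above
struct mtmd_audio_preprocessor_custom : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_custom(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override { /* precompute filter banks here (not thread-safe) */ }
    bool preprocess(const float * samples, size_t n_samples,
                    std::vector<mtmd_audio_mel> & output) override {
        (void) samples; (void) n_samples; (void) output;
        return false; // sketch only: fill `output` with mel chunks and return true
    }
};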
@@ -663,11 +677,10 @@ struct mtmd_tokenizer {
        }

        // preprocess audio
        GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
        std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
        std::vector<mtmd_audio_mel> mel_spec_chunks;
        const float * samples = (const float *) bitmap->data.data();
        size_t n_samples = bitmap->data.size() / sizeof(float);
        bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
        bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
        if (!ok) {
            LOG_ERR("Unable to preprocess audio\n");
            return 2;

@@ -873,8 +886,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
    if (!ctx->ctx_a) {
        return -1;
    }
    // for now, we assume that all audio models have the same bitrate
    return 16000; // 16kHz
    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
}

//
@@ -22,6 +22,11 @@
 * Issues related to API usage may receive lower priority support.
 *
 * For the usage, see an example in mtmd-cli.cpp
 *
 * For contributors:
 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 * - Keep the API minimal, do not expose internal details unless necessary
 */

#ifdef LLAMA_SHARED
@@ -42,6 +42,7 @@ import (
	_ "github.com/ollama/ollama/llama/llama.cpp/common"
	_ "github.com/ollama/ollama/llama/llama.cpp/src"
	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd/models"

	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)
@@ -23,10 +23,10 @@ problem.
 8 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 08681f35e..afde2f0b7 100644
index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@@ -34,7 +34,7 @@ index 08681f35e..afde2f0b7 100644
 }

 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -591,6 +590,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)

     free(ctx->buffers);
     free(ctx);
@@ -42,7 +42,7 @@ index 08681f35e..afde2f0b7 100644
 }

 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2106,6 +2106,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -2125,6 +2125,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(buffer);
     ggml_aligned_free(buffer->context, buffer->size);
@@ -54,7 +54,7 @@ index 08681f35e..afde2f0b7 100644
 }

 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2158,7 +2163,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2177,7 +2182,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };

 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,7 +64,7 @@ index 08681f35e..afde2f0b7 100644
     /* .init_tensor   = */ NULL, // no initialization required
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 81288464c..866758782 100644
index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -84,7 +84,7 @@ index 81288464c..866758782 100644

 /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 279679a4e..5145c1e88 100644
index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -156,10 +156,10 @@ index 18a45d2d9..89041805e 100644

 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 7449a9160..e69a1ff5f 100644
index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);

     delete ctx;
@@ -167,7 +167,7 @@ index 7449a9160..e69a1ff5f 100644
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -817,6 +818,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -175,7 +175,7 @@ index 7449a9160..e69a1ff5f 100644
 }

 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1159,6 +1161,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_

 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -184,10 +184,10 @@ index 7449a9160..e69a1ff5f 100644

 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c6f5809cc..c801d2fd2 100644
index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12271,6 +12271,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -195,7 +195,7 @@ index c6f5809cc..c801d2fd2 100644
 }

 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -12414,6 +12415,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);
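Each backend hunk above adds a single line after the corresponding free call (the inserted line itself is outside this context-only view). Given the removed NULL-check in ggml_backend_buffer_free, the pattern is presumably a guard that clears the free callback so a repeated free becomes a no-op; a sketch of the idea, not the actual patch text:

// sketch only: assumed shape of the per-backend change
static void example_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    // assumption: clearing the callback makes a second free a no-op,
    // matching the guard in ggml_backend_buffer_free above
    buffer->iface.free_buffer = NULL;
}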
@@ -10,7 +10,7 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e2cca66e4..8246a0a14 100644
index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -31,7 +31,7 @@ index e2cca66e4..8246a0a14 100644
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
             clean_spaces = false;
         } else {
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
 1 file changed, 39 insertions(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3ed08a0fe..6be1470ad 100644
index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@

@@ -32,8 +32,8 @@ index 3ed08a0fe..6be1470ad 100644
+
 struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};

 enum ffn_op_type {
 //#define CLIP_DEBUG_FUNCTIONS
@@ -3257,7 +3270,29 @@ struct clip_model_loader {
@@ -1619,7 +1632,29 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 3ed08a0fe..6be1470ad 100644
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
@@ -3284,7 +3319,11 @@ struct clip_model_loader {
@@ -1646,7 +1681,11 @@ struct clip_model_loader {
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
             }
         }
@@ -6,7 +6,7 @@ Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture
---
 src/CMakeLists.txt    |   1 +
 src/llama-arch.cpp    |  21 +++++
 src/llama-arch.cpp    |  20 +++++
 src/llama-arch.h      |   3 +
 src/llama-hparams.cpp |   8 ++
 src/llama-hparams.h   |   5 ++
@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
 src/llama-model.h     |   3 +
 src/models/models.h   |   5 ++
 src/models/solar.cpp  | 158 +++++++++++++++++++++++++++++++++++++
 10 files changed, 253 insertions(+), 1 deletion(-)
 10 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 src/models/solar.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -31,10 +31,10 @@ index 4192af7c0..bd44d73e7 100644
     models/starcoder.cpp
     models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 64ad1b776..a5fe4f66c 100644
index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"      },
     { LLM_ARCH_GRANITE_HYBRID,   "granitehybrid"   },
     { LLM_ARCH_CHAMELEON,        "chameleon"       },
@@ -42,7 +42,7 @@ index 64ad1b776..a5fe4f66c 100644
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"             },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"      },
@@ -206,6 +207,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_OUTPUT_SCALE,       "%s.attention.output_scale"       },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_TEMPERATURE_SCALE,  "%s.attention.temperature_scale"  },
@@ -50,32 +50,38 @@ index 64ad1b776..a5fe4f66c 100644
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,     "%s.attention.key_length_mla"     },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,   "%s.attention.value_length_mla"   },

@@ -2025,6 +2027,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_SOLAR,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_BSKCN_TV,    "bskcn_tv" },
+        },
+    },
     {
         LLM_ARCH_WAVTOKENIZER_DEC,
         {
@@ -2710,6 +2730,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_QKV,       "blk.%d.attn_qkv" },
     { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
     { LLM_TENSOR_ATTN_OUT_NORM,  "blk.%d.attn_output_norm" },
+    { LLM_TENSOR_BSKCN_TV,       "bskcn_tv" },
     { LLM_TENSOR_POS_EMBD,       "position_embd" },
     { LLM_TENSOR_FFN_ACT,        "blk.%d.ffn.act" },
     { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             return {
                 LLM_TENSOR_TOKEN_EMBD,
             };
+        case LLM_ARCH_SOLAR:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_BSKCN_TV,
+            };
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }
@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -84,10 +90,10 @@ index 64ad1b776..a5fe4f66c 100644
     {LLM_TENSOR_POS_NET_NORM,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index e11318002..ec9e3a6df 100644
index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -89,6 +89,7 @@ enum llm_arch {
@@ -91,6 +91,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_GRANITE_HYBRID,
     LLM_ARCH_CHAMELEON,
@@ -95,7 +101,7 @@ index e11318002..ec9e3a6df 100644
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
@@ -210,6 +211,7 @@ enum llm_kv {
@@ -212,6 +213,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -103,7 +109,7 @@ index e11318002..ec9e3a6df 100644
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -462,6 +464,7 @@ enum llm_tensor {
@@ -465,6 +467,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
@@ -112,10 +118,10 @@ index e11318002..ec9e3a6df 100644
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 8cdbaf69f..41127bf91 100644
index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
@@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
     return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
 }

@@ -131,10 +137,10 @@ index 8cdbaf69f..41127bf91 100644
     if (il < n_layer) {
         return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6eff334a5..a778fc3cf 100644
index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams {
@@ -65,6 +65,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -143,7 +149,7 @@ index 6eff334a5..a778fc3cf 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
@@ -256,6 +258,9 @@ struct llama_hparams {
@@ -259,6 +261,9 @@ struct llama_hparams {

     uint32_t n_pos_per_embd() const;

@@ -154,7 +160,7 @@ index 6eff334a5..a778fc3cf 100644

     bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87..ee303bd58 100644
index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -466,7 +466,7 @@ namespace GGUFMeta {
@@ -167,10 +173,10 @@ index aa3a65f87..ee303bd58 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04fccc979..3c503b424 100644
index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1975,6 +1975,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -192,7 +198,7 @@ index 04fccc979..3c503b424 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5401,6 +5416,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -227,7 +233,7 @@ index 04fccc979..3c503b424 100644
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                     layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
@@ -7480,6 +7523,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
@@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params);
             } break;
@@ -238,7 +244,7 @@ index 04fccc979..3c503b424 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -7743,6 +7790,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
@@ -247,7 +253,7 @@ index 04fccc979..3c503b424 100644
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index f8342cf2c..cbf4e1bfa 100644
index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
@@ -258,7 +264,7 @@ index f8342cf2c..cbf4e1bfa 100644
     LLM_TYPE_26B,
     LLM_TYPE_27B,
     LLM_TYPE_30B,
@@ -404,6 +405,8 @@ struct llama_layer {
@@ -405,6 +406,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_act_beta = nullptr;
     struct ggml_tensor * ffn_act_eps  = nullptr;

@@ -268,7 +274,7 @@ index f8342cf2c..cbf4e1bfa 100644

     struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h
index 6494f5450..e0aec822c 100644
index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
@@ -12,7 +12,7 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8246a0a14..dfba7778b 100644
index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index c3b4e5d9d..6be552826 100644
+index 2f67c74d7..acf00e2d2 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
-@@ -310,7 +310,7 @@ private:
+@@ -311,7 +311,7 @@ private:
 friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
 std::function<json(const std::string &)> _fetch_json;
 bool _dotall;
@@ -53,7 +53,7 @@ index b165d8bdc..f91d4faba 100644
 }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index dfba7778b..f72f321b9 100644
+index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index b468b115a..bb65985b4 100644
+index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index b468b115a..bb65985b4 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2928,6 +2930,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2945,6 +2947,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 ggml_compute_forward(&params, node);
@@ -11,10 +11,10 @@ Subject: [PATCH] graph memory reporting on failure
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
-index 2cb150fd2..7ab3f0192 100644
+index 78aa059dd..7fa8403b3 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
[hunk offset updated around ggml_gallocr_alloc_graph / ggml_gallocr_get_buffer_size]
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index f1b740785..c54ff98bf 100644
+index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
[hunk offset updated around ggml_backend_sched_get_buffer_size]
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index a5995fdc2..dbfd8b5b2 100644
+index 41419b617..73b39bfea 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
[the buffer_sizes bookkeeping moves from ggml_gallocr_reserve_n into ggml_gallocr_reserve_n_impl; on ggml_vbuffer_alloc failure the patch still records the attempted size in galloc->buffer_sizes[i] and sets success = false instead of returning early]
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index afde2f0b7..dbf8486a0 100644
+index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
[hunk offset updated around ggml_backend_sched_get_buffer_size]
@@ -10,7 +10,7 @@ Subject: [PATCH] ggml: Export GPU UUIDs
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index c54ff98bf..229bf387b 100644
+index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
[hunk offset updated around the device property fields (memory_total, device type)]
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 5145c1e88..f641c1016 100644
+index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
-@@ -4833,6 +4887,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
 dev_ctx->description = prop.name;
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
 2 files changed, 13 insertions(+)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
-index d06fa42e6..0f5712e21 100644
+index 2638fe4fc..c4e905a4e 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
 return "<__media__>";
 }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
-index b3df24c29..a6a1af3b8 100644
+index 9f7e861e9..72cec1937 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
-@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
+@@ -80,6 +80,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
 typedef struct mtmd_input_chunks mtmd_input_chunks;
 typedef struct mtmd_input_text mtmd_input_text;
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index bb65985b4..47089a62e 100644
+index 53891a91f..8d4851312 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -2464,7 +2464,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
+@@ -2479,7 +2479,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
 // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
 // all our threads onto the first 4 cores which results in terrible performance with
 // n_threads > 4
@@ -20,7 +20,7 @@ consistent performance.
 8 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 229bf387b..2763f2bd6 100644
+index 03557bb31..93c95602d 100644
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986..0f5b03cef 100644
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index dbf8486a0..312ca873c 100644
+index 1459d16dd..498186a7c 100644
@@ -156,7 +156,7 @@ index 5b888cdd8..88d088952 100644
 static struct ggml_backend_i blas_backend_i = {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
-index 3191faaa4..32f14c811 100644
+index f4713a421..92ba577a5 100644
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index f641c1016..17062697b 100644
+index c9d3a2b03..25548629d 100644
@@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644
 static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index c801d2fd2..b2c0d0cee 100644
+index 120191ca0..5349bce24 100644
[hunk offsets for ggml_backend_sched_set_batch_size and the batch-size plumbing through ggml_backend_graph_compute and the per-backend graph_compute callbacks updated for the rebased vendor commit]
@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
-index 4d053895c..84bdc2777 100644
+index f68829a61..2024d3d37 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@
@@ -10,13 +10,13 @@ must be recreated with no-alloc set to false before loading data.
---
 ggml/include/ggml-backend.h     |   1 +
 ggml/src/ggml-backend-impl.h    |  16 +++
-ggml/src/ggml-backend.cpp       |  72 +++++++++-
+ggml/src/ggml-backend.cpp       |  75 ++++++++++-
 ggml/src/ggml-cuda/common.cuh   |  62 ++++++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
-5 files changed, 331 insertions(+), 44 deletions(-)
+5 files changed, 333 insertions(+), 45 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 2763f2bd6..b3b5b356a 100644
+index 93c95602d..dbbb61d9c 100644
@@ -75,13 +75,19 @@ index 0f5b03cef..7bdf9d81f 100644
 struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 312ca873c..4092dfe8a 100644
+index 498186a7c..7746e8b92 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
[the no_alloc fake-buffer path in ggml_backend_buft_alloc_buffer is kept; the rebased hunk drops the leading GGML_ASSERT(buft) and re-adds it after the no_alloc branch, ggml_backend_buffer_get_base gains a FIXME about multi_buffer bases, and the remaining sched/reserve hunks only shift line offsets]
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index c4529f5d9..8b0fb5d42 100644
+index 9fcb2f9fd..e800ee8f6 100644
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 17062697b..ede1d089a 100644
+index 25548629d..eeaae3fe4 100644
@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 417140071..87f407f99 100644
+index 8786d4ee3..9e6998272 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -999,8 +999,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
+@@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 const int64_t n_vocab = vocab.n_tokens();
 const int64_t n_embd  = hparams.n_embd_inp();
@@ -16,7 +16,7 @@ unused then it can be reset to free these data structures.
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index b3b5b356a..69223c488 100644
+index dbbb61d9c..92ca32a4b 100644
@@ -43,10 +43,10 @@ index 7bdf9d81f..21b35ac5c 100644
 struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 4092dfe8a..a1a19fe51 100644
+index 7746e8b92..189e97170 100644
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index ede1d089a..ec63cadab 100644
+index eeaae3fe4..6852d2e20 100644
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
-index b7d6edf7f..b987d7aeb 100644
+index 951a88d56..4e162258d 100644
diff --git a/src/llama.cpp b/src/llama.cpp
-index ab2e9868a..74c49e651 100644
+index f69964b6d..759152b76 100644
[hunk offsets for the backend-device reset support (ggml_backend_dev_init, the ggml_backend_cuda_device_interface entries and the llama_model_load_from_file_impl device-props loop) updated for the rebased vendor commit]
@@ -28,7 +28,7 @@ fix vulkan PCI ID and ID handling
 create mode 100644 ggml/src/mem_nvml.cpp

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 69223c488..6510e0cba 100644
+index 92ca32a4b..6ad583f09 100644
@@ -58,7 +58,7 @@ index d55aed348..99ae293cc 100644
 set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index ec63cadab..cd71902df 100644
+index 6852d2e20..48cdb1dcf 100644
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
-index b987d7aeb..5ad5623ae 100644
+index 4e162258d..d89e35a8e 100644
@@ -243,7 +243,7 @@ index ba95b4acc..f6f8f7a10 100644
 /* .async = */ true,
 /* .host_buffer = */ false,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index b2c0d0cee..d9f4d34f5 100644
+index 5349bce24..d43d46d1d 100644
[hunk offsets for the ggml/src/mem_nvml.cpp integration and the Vulkan PCI-ID/UUID handling (ggml_backend_vk_get_device_pci_id, ggml_backend_vk_device_get_props, ggml_backend_vk_reg_get_device) updated for the rebased vendor commit]
@@ -38,7 +38,7 @@ index 1c07e767a..0da3e065b 100644
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index d9f4d34f5..8a83427fb 100644
+index d43d46d1d..df79f9f79 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
[hunk offsets for the device UUID and integrated-GPU memory handling in ggml_backend_vk_get_device_memory and ggml_backend_vk_reg_get_device updated for the rebased vendor commit]
@@ -10,10 +10,10 @@ fallback to cpu
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cd71902df..d69d62193 100644
+index 48cdb1dcf..3102d7ea7 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -4632,6 +4632,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+@@ -4633,6 +4633,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
     return false;
 }
@@ -8,7 +8,7 @@ Subject: [PATCH] win: exit instead of abort
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 530ff7b95..fc0196eb7 100644
+index eb3ae72ea..c9242a15a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -250,8 +250,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
@@ -9,10 +9,10 @@ Rever to prior logic of assuming an empty projector type is mlp
 1 file changed, 4 insertions(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 6be1470ad..2a325c726 100644
+index 84a3796b5..d3a37842d 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
-@@ -2649,6 +2649,10 @@ struct clip_model_loader {
+@@ -960,6 +960,10 @@ struct clip_model_loader {
 if (proj_type.empty()) {
     if (modality == CLIP_MODALITY_VISION) {
         get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
@ -1,586 +0,0 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||
Date: Mon, 15 Dec 2025 15:13:49 +0100
|
||||
Subject: [PATCH] llama : add support for NVIDIA Nemotron Nano 3
|
||||
|
||||
This commit adds support for the NVIDIA Nemotron Nano 3 model, enabling
|
||||
the conversion and running of this model.
|
||||
|
||||
fix indentation in llama-graph.cpp
|
||||
|
||||
fix indentation and move ffn_inp
|
||||
|
||||
convert : fix modify_tensors in NemotronHModel to call super()
|
||||
|
||||
fix pyright error
|
||||
|
||||
fix flake8 errors
|
||||
---
|
||||
convert_hf_to_gguf.py | 116 +++++++++++++++++++++++++++++++--
|
||||
gguf-py/gguf/constants.py | 29 +++++++++
|
||||
gguf-py/gguf/tensor_mapping.py | 9 ++-
|
||||
src/llama-arch.cpp | 35 ++++++++++
|
||||
src/llama-arch.h | 1 +
|
||||
src/llama-graph.cpp | 10 +++
|
||||
src/llama-model.cpp | 50 +++++++++++---
|
||||
src/llama-model.h | 1 +
|
||||
src/models/nemotron-h.cpp | 41 ++++++++++--
|
||||
9 files changed, 269 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
|
||||
index 867bc9053..57ec2faac 100755
|
||||
--- a/convert_hf_to_gguf.py
|
||||
+++ b/convert_hf_to_gguf.py
|
||||
@@ -8601,8 +8601,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
||||
class NemotronHModel(GraniteHybridModel):
|
||||
"""Hybrid mamba2/attention model from NVIDIA"""
|
||||
model_arch = gguf.MODEL_ARCH.NEMOTRON_H
|
||||
+ is_moe: bool = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
+ # We have to determine the correct model architecture (MoE vs non-MoE) before
|
||||
+ # calling the parent __init__. This is because the parent constructor
|
||||
+ # uses self.model_arch to build the tensor name map, and all MoE-specific
|
||||
+ # mappings would be missed if it were called with the default non-MoE arch.
|
||||
+ hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
|
||||
+ if "num_experts_per_tok" in hparams:
|
||||
+ self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
|
||||
+ self.is_moe = True
|
||||
+
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Save the top-level head_dim for later
|
||||
@@ -8614,9 +8624,11 @@ class NemotronHModel(GraniteHybridModel):
|
||||
|
||||
# Update the ssm / attn / mlp layers
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
+ # MoE:
|
||||
+ # M: Mamba2, *: Attention, E: Expert
|
||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
||||
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
|
||||
- self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
|
||||
+ self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
|
||||
|
||||
def get_attn_layers(self):
|
||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
||||
@@ -8632,10 +8644,28 @@ class NemotronHModel(GraniteHybridModel):
|
||||
# Set feed_forward_length
|
||||
# NOTE: This will trigger an override warning. This is preferrable to
|
||||
# duplicating all the parent logic
|
||||
- n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
|
||||
- self.gguf_writer.add_feed_forward_length([
|
||||
- n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
- ])
|
||||
+ if not self.is_moe:
|
||||
+ n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
|
||||
+ self.gguf_writer.add_feed_forward_length([
|
||||
+ n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
+ ])
|
||||
+ else:
|
||||
+ moe_intermediate_size = self.hparams["moe_intermediate_size"]
|
||||
+ self.gguf_writer.add_feed_forward_length([
|
||||
+ moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
+ ])
|
||||
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
||||
+ self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
|
||||
+ self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
|
||||
+ self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
|
||||
+ self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
||||
+ self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
||||
+ self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
||||
+ self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
|
||||
+
|
||||
+ # number of experts used per token (top-k)
|
||||
+ if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||
+ self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
@@ -8643,7 +8673,81 @@ class NemotronHModel(GraniteHybridModel):
|
||||
# The tokenizer _does_ add a BOS token (via post_processor type
|
||||
# TemplateProcessing) but does not set add_bos_token to true in the
|
||||
# config, so we need to explicitly override it here.
|
||||
- self.gguf_writer.add_add_bos_token(True)
|
||||
+ if not self.is_moe:
|
||||
+ self.gguf_writer.add_add_bos_token(True)
|
||||
+
|
||||
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
+ if self.is_moe and bid is not None:
|
||||
+ if name.endswith("mixer.gate.e_score_correction_bias"):
|
||||
+ new_name = name.replace("e_score_correction_bias", "e_score_correction_bias.bias")
|
||||
+ mapped_name = self.map_tensor_name(new_name)
|
||||
+ return [(mapped_name, data_torch)]
|
||||
+
|
||||
+ if name.endswith("mixer.dt_bias"):
|
||||
+ new_name = name.replace("dt_bias", "dt.bias")
|
||||
+ mapped_name = self.map_tensor_name(new_name)
|
||||
+ return [(mapped_name, data_torch)]
|
||||
+
|
||||
+ if name.endswith("mixer.conv1d.weight"):
|
||||
+ squeezed_data = data_torch.squeeze()
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, squeezed_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.A_log"):
|
||||
+ transformed_data = -torch.exp(data_torch)
|
||||
+ reshaped_data = transformed_data.squeeze().reshape(-1, 1)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.D"):
|
||||
+ reshaped_data = data_torch.squeeze().reshape(-1, 1)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.norm.weight"):
|
||||
+ reshaped_data = data_torch.reshape(8, 512)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.find("mixer.experts") != -1:
|
||||
+ n_experts = self.hparams["n_routed_experts"]
|
||||
+ assert bid is not None
|
||||
+
|
||||
+ if self._experts is None:
|
||||
+ self._experts = [{} for _ in range(self.block_count)]
|
||||
+
|
||||
+ self._experts[bid][name] = data_torch
|
||||
+
|
||||
+ if len(self._experts[bid]) >= n_experts * 2:
|
||||
+ # merge the experts into a single tensor
|
||||
+ tensors: list[tuple[str, Tensor]] = []
|
||||
+ for w_name in ["down_proj", "up_proj"]:
|
||||
+ datas: list[Tensor] = []
|
||||
+
|
||||
+ for xid in range(n_experts):
|
||||
+ ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
|
||||
+ datas.append(self._experts[bid][ename])
|
||||
+ del self._experts[bid][ename]
|
||||
+
|
||||
+ data_torch = torch.stack(datas, dim=0)
|
||||
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
+ new_name = self.map_tensor_name(merged_name)
|
||||
+ tensors.append((new_name, data_torch))
|
||||
+
|
||||
+ return tensors
|
||||
+ else:
|
||||
+ return []
|
||||
+
|
||||
+ return super().modify_tensors(data_torch, name, bid)
|
||||
+
|
||||
+ def prepare_tensors(self):
|
||||
+ super().prepare_tensors()
|
||||
+
|
||||
+ if self._experts is not None:
|
||||
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
+ experts = [k for d in self._experts for k in d.keys()]
|
||||
+ if len(experts) > 0:
|
||||
+ raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("BailingMoeForCausalLM")
|
||||
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
|
||||
index 2b8489c59..1852428b4 100644
|
||||
--- a/gguf-py/gguf/constants.py
|
||||
+++ b/gguf-py/gguf/constants.py
|
||||
@@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
|
||||
JAIS = auto()
|
||||
NEMOTRON = auto()
|
||||
NEMOTRON_H = auto()
|
||||
+ NEMOTRON_H_MOE = auto()
|
||||
EXAONE = auto()
|
||||
EXAONE4 = auto()
|
||||
GRANITE = auto()
|
||||
@@ -786,6 +787,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.JAIS: "jais",
|
||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
|
||||
+ MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
|
||||
MODEL_ARCH.EXAONE: "exaone",
|
||||
MODEL_ARCH.EXAONE4: "exaone4",
|
||||
MODEL_ARCH.GRANITE: "granite",
|
||||
@@ -2529,6 +2531,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
+ MODEL_ARCH.NEMOTRON_H_MOE: [
|
||||
+ MODEL_TENSOR.TOKEN_EMBD,
|
||||
+ MODEL_TENSOR.OUTPUT_NORM,
|
||||
+ MODEL_TENSOR.OUTPUT,
|
||||
+ MODEL_TENSOR.ATTN_NORM,
|
||||
+ MODEL_TENSOR.SSM_IN,
|
||||
+ MODEL_TENSOR.SSM_CONV1D,
|
||||
+ MODEL_TENSOR.SSM_DT,
|
||||
+ MODEL_TENSOR.SSM_A,
|
||||
+ MODEL_TENSOR.SSM_D,
|
||||
+ MODEL_TENSOR.SSM_NORM,
|
||||
+ MODEL_TENSOR.SSM_OUT,
|
||||
+ MODEL_TENSOR.ATTN_Q,
|
||||
+ MODEL_TENSOR.ATTN_K,
|
||||
+ MODEL_TENSOR.ATTN_V,
|
||||
+ MODEL_TENSOR.ATTN_OUT,
|
||||
+ MODEL_TENSOR.FFN_DOWN,
|
||||
+ MODEL_TENSOR.FFN_UP,
|
||||
+ # experts
|
||||
+ MODEL_TENSOR.FFN_GATE_INP,
|
||||
+ MODEL_TENSOR.FFN_UP_EXP,
|
||||
+ MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
+ # shared expert
|
||||
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
+ MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
+ ],
|
||||
MODEL_ARCH.EXAONE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
|
||||
index d9c87da19..7a3c7c5e0 100644
|
||||
--- a/gguf-py/gguf/tensor_mapping.py
|
||||
+++ b/gguf-py/gguf/tensor_mapping.py
|
||||
@@ -377,6 +377,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.gate", # lfm2moe
            "model.layers.{bid}.mlp.router.gate", # afmoe
            "layers.{bid}.gate", # mistral-large
+           "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -390,6 +391,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.expert_bias", # afmoe
            "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
            "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
+           "backbone.layers.{bid}.mixer.gate.e_score_correction_bias", # nemotron-h-moe
        ),

        # Feed-forward up
@@ -438,7 +440,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
-           "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
@@ -452,6 +454,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.down_proj",
            "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
            "layers.{bid}.shared_experts.w3", # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -546,7 +549,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
-           "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
            "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
@@ -561,6 +564,7 @@ class TensorNameMap:
            "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
            "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
            "layers.{bid}.shared_experts.w2", # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -704,6 +708,7 @@ class TensorNameMap:
            "model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
            "model.layers.layers.{bid}.mixer.dt_proj", # plamo2
            "model.layers.{bid}.linear_attn.dt_proj", # qwen3next
+           "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
        ),

        MODEL_TENSOR.SSM_DT_NORM: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a5fe4f66c..ac8b5e033 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
+   { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_EXAONE4, "exaone4" },
    { LLM_ARCH_RWKV6, "rwkv6" },
@@ -1765,6 +1766,39 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
+   {
+       LLM_ARCH_NEMOTRON_H_MOE,
+       {
+           { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+           { LLM_TENSOR_OUTPUT, "output" },
+           { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+           // mamba(2) ssm layers
+           { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+           { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+           { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+           { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+           { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+           { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+           { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+           // attention layers
+           { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+           // dense FFN
+           { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+           // MoE FFN (for MoE layers)
+           { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+           { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+           { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+           { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+           // MoE shared expert layer
+           { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+           { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+       },
+   },
    {
        LLM_ARCH_EXAONE,
        {
@@ -2838,6 +2872,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_QWEN3NEXT:
            return true;
        default:
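An editorial aside on the name table above (not part of the patch): each `LLM_TENSOR_NAMES` entry is a printf-style pattern, and llama.cpp's `tn()` helper expands the `%d` with the layer index and appends a suffix such as `weight` or `bias`. A minimal sketch of that expansion, with a hypothetical helper name:

```cpp
// Hypothetical sketch of how "blk.%d.ffn_up_exps" becomes the concrete
// GGUF tensor name that load_tensors() looks up; the real tn() helper in
// llama.cpp is more general, this only illustrates the expansion.
#include <cstdio>
#include <string>

static std::string format_tensor_name(const char * pattern, const char * suffix, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, il); // "blk.%d.ffn_up_exps" -> "blk.3.ffn_up_exps"
    return std::string(buf) + "." + suffix;       // -> "blk.3.ffn_up_exps.weight"
}
```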
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ec9e3a6df..61d73786c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_NEMOTRON_H,
+   LLM_ARCH_NEMOTRON_H_MOE,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 43620df78..763202d87 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1089,6 +1089,16 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                cur = ggml_relu(ctx0, cur);
                cb(cur, "ffn_moe_relu", il);
            } break;
+       case LLM_FFN_RELU_SQR:
+           if (gate_exps) {
+               // TODO: add support for gated squared relu
+               GGML_ABORT("fatal error: gated squared relu not implemented");
+           } else {
+               cur = ggml_relu(ctx0, cur);
+               cur = ggml_sqr(ctx0, cur);
+               cb(cur, "ffn_moe_relu_sqr", il);
+           }
+           break;
        default:
            GGML_ABORT("fatal error");
    }
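The new `LLM_FFN_RELU_SQR` branch builds squared ReLU out of two existing ops (`ggml_relu` followed by `ggml_sqr`). Element-wise it is simply `max(x, 0)^2`, as in this illustrative sketch:

```cpp
// Scalar reference for the squared-ReLU activation composed above:
// f(x) = max(x, 0)^2; the graph version applies this element-wise.
#include <algorithm>

static float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}
```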
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3c503b424..94dee78c3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
+       case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+       case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -1788,6 +1790,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            }
        } break;
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
        {
            ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
            ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1803,7 +1806,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+           ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+           ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+           ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+           ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+           ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
            switch (hparams.n_layer) {
+               case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                case 56: type = LLM_TYPE_9B; break;
                default: type = LLM_TYPE_UNKNOWN;
            }
@@ -5175,6 +5185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            }
        } break;
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
        {
            // mamba2 Mixer SSM params
            // NOTE: int64_t for tensor dimensions
@@ -5185,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            const int64_t n_group = hparams.ssm_n_group;
            const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+           const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+           const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
            // embeddings
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5234,12 +5248,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-           } else {
-               // mlp layers
-               layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
-               layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
-               layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-               layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+           } else {
+               if (n_expert != 0) {
+                   layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                   layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+                   // MoE branch
+                   layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                   layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                   // Shared expert branch
+                   layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                   layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+               } else {
+                   // mlp layers
+                   layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                   layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                   layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                   layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+               }
            }
        }
    } break;
@@ -6870,7 +6898,8 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_PLAMO2 ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_QWEN3NEXT ||
-       arch == LLM_ARCH_NEMOTRON_H) {
+       arch == LLM_ARCH_NEMOTRON_H ||
+       arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6926,7 +6955,8 @@ void llama_model::print_info() const {
    if (arch == LLM_ARCH_MINICPM ||
        arch == LLM_ARCH_GRANITE ||
        arch == LLM_ARCH_GRANITE_MOE ||
-       arch == LLM_ARCH_GRANITE_HYBRID) {
+       arch == LLM_ARCH_GRANITE_HYBRID ||
+       arch == LLM_ARCH_NEMOTRON_H_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7107,7 +7137,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
    if (arch == LLM_ARCH_FALCON_H1) {
        filter_attn = [&](int32_t) { return true; };
        filter_recr = [&](int32_t) { return true; };
-   } else if (arch == LLM_ARCH_NEMOTRON_H) {
+   } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
        filter_attn = [&](int32_t il) {
            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
        };
@@ -7478,6 +7508,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            llm = std::make_unique<llm_build_nemotron>(*this, params);
        } break;
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
        {
            llm = std::make_unique<llm_build_nemotron_h>(*this, params);
        } break;
@@ -7765,6 +7796,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
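A note on the shapes used above: `ffn_up_exps`/`ffn_down_exps` are created as 3-D tensors holding all experts merged along the last dimension, so each expert is one contiguous 2-D weight matrix. A hedged sketch of that layout (flat buffer, no strides, illustrative only):

```cpp
// Hypothetical sketch: locating expert e inside a merged tensor created
// with dims {n_embd, n_ff_exp, n_expert} (cf. ffn_up_exps above). Real
// ggml tensors carry per-dimension strides; this assumes a dense buffer.
#include <cstddef>

static const float * expert_slice(const float * data, size_t n_embd, size_t n_ff_exp, size_t e) {
    return data + e * (n_embd * n_ff_exp); // one n_embd x n_ff_exp matrix per expert
}
```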
diff --git a/src/llama-model.h b/src/llama-model.h
index cbf4e1bfa..b378b23ec 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -114,6 +114,7 @@ enum llm_type {
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
+   LLM_TYPE_31B_A3_5B,
    LLM_TYPE_80B_A3B, // Qwen3 Next
    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp
index 541434888..eb135e63f 100644
--- a/src/models/nemotron-h.cpp
+++ b/src/models/nemotron-h.cpp
@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
}

ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
-   cur = build_ffn(cur,
-       model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-       NULL, NULL, NULL,
-       model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-       NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
-   cb(cur, "ffn_out", il);
+   if (model.layers[il].ffn_gate_inp == nullptr) {
+       cur = build_ffn(cur,
+           model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+           NULL, NULL, NULL,
+           model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+           NULL,
+           LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+       cb(cur, "ffn_out", il);
+   } else {
+       ggml_tensor * ffn_inp = cur;
+       ggml_tensor * moe_out =
+           build_moe_ffn(ffn_inp,
+               model.layers[il].ffn_gate_inp,
+               model.layers[il].ffn_up_exps,
+               nullptr, // no gate
+               model.layers[il].ffn_down_exps,
+               model.layers[il].ffn_exp_probs_b,
+               n_expert, n_expert_used,
+               LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+               true, hparams.expert_weights_scale,
+               LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+               il);
+       cb(moe_out, "ffn_moe_out", il);
+
+       ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+           model.layers[il].ffn_up_shexp, NULL, NULL,
+           NULL /* no gate */, NULL, NULL,
+           model.layers[il].ffn_down_shexp, NULL, NULL,
+           NULL,
+           LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+       cb(ffn_shexp, "ffn_shexp", il);
+
+       cur = ggml_add(ctx0, moe_out, ffn_shexp);
+       cb(cur, "ffn_out", il);
+   }

    cur = build_cvec(cur, il);
    cb(cur, "l_out", il);
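For readers following the `build_moe_ffn` call above: with sigmoid gating and an `exp_probs_b` bias, expert selection uses the bias-corrected scores while the mixing weights come from the unbiased sigmoid, optionally normalized (`expert_weights_norm`) and then scaled (`expert_weights_scale`). A scalar sketch of that routing math, illustrative rather than the actual tensor implementation:

```cpp
// Illustrative scalar version of the sigmoid routing requested above:
// selection uses bias-corrected scores (exp_probs_b), mixing weights use
// the unbiased sigmoid, optionally normalized and then scaled.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

static std::vector<std::pair<int, float>> route_experts(
        const std::vector<float> & logits, // router output, one score per expert
        const std::vector<float> & bias,   // exp_probs_b
        int n_used, bool norm_w, float scale) {
    const int n_expert = (int) logits.size();
    std::vector<float> gate(n_expert), sel(n_expert);
    for (int e = 0; e < n_expert; e++) {
        gate[e] = 1.0f / (1.0f + std::exp(-logits[e])); // sigmoid gating
        sel[e]  = gate[e] + bias[e];                    // bias only affects selection
    }
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                      [&](int a, int b) { return sel[a] > sel[b]; });
    float sum = 0.0f;
    for (int k = 0; k < n_used; k++) sum += gate[idx[k]];
    std::vector<std::pair<int, float>> out;
    for (int k = 0; k < n_used; k++) {
        float w = gate[idx[k]];
        if (norm_w) w /= sum;                // hparams.expert_weights_norm
        out.emplace_back(idx[k], w * scale); // hparams.expert_weights_scale
    }
    return out;
}
```

The shared expert runs unconditionally on the same input and its output is added to the routed mixture, which is exactly the `ggml_add(ctx0, moe_out, ffn_shexp)` at the end of the branch above.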
@@ -72,7 +72,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
    try {
        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
        std::vector<std::string> splits = {};
-       llama_model_loader ml(std::string(fname), splits, false, false, nullptr, nullptr);
+       llama_model_loader ml(std::string(fname), splits, false, false, false, nullptr, nullptr);
        vocab->load(ml, kv);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());

@@ -54,10 +54,6 @@ type CacheConfig struct {
    // MaskDType specifies the data type for generating the mask. If unset it will
    // default to DTypeF32.
    MaskDType DType
-
-   // MaskBatchPadding specifies the multiple for the batch size dimension in the mask.
-   // Any position that does not correspond to an actual token will be filled with -Inf.
-   MaskBatchPadding int
}

// BackendParams controls how the backend loads and executes models

@@ -685,7 +685,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {

func (b *Backend) CacheConfig() ml.CacheConfig {
    if b.flashAttention == ml.FlashAttentionEnabled {
-       return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
+       return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16}
    } else {
        return ml.CacheConfig{CachePadding: 256, PermutedV: true}
    }

@@ -1660,11 +1660,6 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sin
    }

    if mask != nil {
-       padSize := int(pad(C.size_t(mask.Dim(1)), C.size_t(cacheConfig.MaskBatchPadding))) - mask.Dim(1)
-       if padSize > 0 {
-           mask = mask.Pad(ctx, 0, padSize, 0, 0)
-       }
-
        if mask.DType() != cacheConfig.MaskDType {
            mask = mask.Cast(ctx, cacheConfig.MaskDType)
        }

@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+                  ggml_gallocr_t galloc,
+                  struct ggml_cgraph * graph,
+                  const int * node_buffer_ids,
+                  const int * leaf_buffer_ids,
+                  size_t * sizes);
GGML_API bool ggml_gallocr_reserve_n(
                  ggml_gallocr_t galloc,
                  struct ggml_cgraph * graph,

@@ -69,6 +76,8 @@ GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, in

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
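A hedged usage sketch for the size-query entry point declared above: it runs the same planning as its allocating counterpart but only reports how large each buffer would be. The helper name and surrounding setup are assumptions for illustration:

```cpp
// Hypothetical usage: measure the reserve footprint without allocating.
// Assumes a built worst-case graph `gf` and an initialized `galloc`
// created over `n_bufts` buffer types.
#include <cstdio>
#include <vector>
#include "ggml-alloc.h"

static void report_reserve_sizes(ggml_gallocr_t galloc, struct ggml_cgraph * gf, int n_bufts) {
    std::vector<size_t> sizes(n_bufts, 0);
    ggml_gallocr_reserve_n_size(galloc, gf, /*node_buffer_ids=*/ NULL, /*leaf_buffer_ids=*/ NULL, sizes.data());
    for (int i = 0; i < n_bufts; i++) {
        std::printf("buffer %d would need %zu bytes\n", i, sizes[i]);
    }
}
```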
@@ -319,6 +319,7 @@ extern "C" {
    GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);

    // Initialize backend buffers from a measure graph
+   GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);

@@ -99,6 +99,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sme     (void);
    // other
    GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+   GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
    GGML_BACKEND_API int ggml_cpu_has_vsx     (void);
    GGML_BACKEND_API int ggml_cpu_has_vxe     (void);
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);

@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
+
+// number of threads used for zendnn operations
+GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
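A hedged usage sketch for the new ZenDNN backend API above; the include name is an assumption (the file path is not shown in this diff), and availability depends on how ggml was built:

```cpp
// Hypothetical usage of the declarations above. "ggml-zendnn.h" is an
// assumed include name; error handling is minimal on purpose.
#include "ggml-zendnn.h"

ggml_backend_t init_zendnn_backend(int n_threads) {
    ggml_backend_t backend = ggml_backend_zendnn_init();
    if (backend != NULL && ggml_backend_is_zendnn(backend)) {
        ggml_backend_zendnn_set_n_threads(backend, n_threads);
    }
    return backend; // NULL if the backend could not be initialized
}
```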
@@ -2305,13 +2305,11 @@ extern "C" {
            float stop,
            float step);

-#define GGML_KQ_MASK_PAD 1
-
-   // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
-   // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
-   // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
-   // mask: [n_kv,     n_batch_pad, ne32, ne33]      !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-   // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+   // q:    [n_embd_k, n_batch, n_head,    ne3 ]
+   // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
+   // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
+   // mask: [n_kv,     n_batch, ne32, ne33]
+   // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
    //
    // broadcast:
    //   n_head % n_head_kv == 0
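Under the new convention above, the mask is no longer padded to a multiple of `GGML_KQ_MASK_PAD`; it is simply `[n_kv, n_batch]`, with `0` for visible positions and `-INFINITY` elsewhere. A small illustrative sketch of a causal mask under that shape:

```cpp
// Illustrative only: build an unpadded [n_kv, n_batch] causal mask, row i
// covering batch token i, which may attend to n_past + i history positions.
#include <cmath>
#include <vector>

static std::vector<float> build_causal_mask(int n_kv, int n_batch, int n_past) {
    std::vector<float> mask((size_t) n_kv * n_batch, -INFINITY);
    for (int i = 0; i < n_batch; i++) {
        for (int j = 0; j <= n_past + i && j < n_kv; j++) {
            mask[(size_t) i * n_kv + j] = 0.0f; // visible position
        }
    }
    return mask;
}
```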
@@ -2617,7 +2615,8 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
-   GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+   GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+   GGML_API void ggml_log_set(ggml_log_callback  log_callback,  void *  user_data);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
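The new `ggml_log_get` makes it possible to save and later restore whatever callback is installed, e.g. to silence logging around a noisy call. A minimal sketch, assuming a no-op logger:

```cpp
// Hypothetical helper built on the API above: run fn() with ggml logging
// muted, then restore the previously installed callback.
#include "ggml.h"

static void null_logger(enum ggml_log_level /*level*/, const char * /*text*/, void * /*user*/) {}

static void with_quiet_ggml(void (*fn)(void)) {
    ggml_log_callback prev_cb = NULL;
    void * prev_ud = NULL;
    ggml_log_get(&prev_cb, &prev_ud); // save the current callback
    ggml_log_set(null_logger, NULL);  // mute
    fn();
    ggml_log_set(prev_cb, prev_ud);   // restore
}
```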
@@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
    size = aligned_offset(NULL, size, alloc->alignment);

-   AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-       __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-   remove_allocated_tensor(alloc, addr, tensor);
-#endif
-
    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block

@@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-
-   GGML_UNUSED(tensor);
}

static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {

@@ -608,7 +599,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-   return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+   return t->data != NULL                 // tensor data already set externally
+       || t->buffer                       // tensor on external buffer (but not yet allocated)
+       || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
}

// free the extra space at the end if the new tensor is smaller

@@ -621,13 +614,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten

    GGML_ASSERT(parent_size >= node_size);

+   // note: we want the chunks to remain aligned after the freeing
+   struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+   parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+   node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
+
    if (parent_size > node_size) {
-       struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-       ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+       ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
    }
}

@@ -711,7 +708,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-   ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+
+   AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+       __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef GGML_ALLOCATOR_DEBUG
+   remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+
+   ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
    hn->allocated = false;
}

@@ -826,7 +830,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
    }
}

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+       ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;
@@ -933,19 +938,22 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
            if (cur_size > 0) {
                GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                   __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                   cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                   __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
            }
        }
#endif
        ggml_vbuffer_free(galloc->buffers[i]);
-       galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-       if (galloc->buffers[i]) {
-           galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+       if (no_alloc) {
+           galloc->buffers[i] = NULL;
        } else {
-           GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-           galloc->buffer_sizes[i] = new_size;
-           success = false;
+           galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+           if (galloc->buffers[i]) {
+               galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+           } else {
+               GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+               galloc->buffer_sizes[i] = new_size;
+               success = false;
+           }
        }
    } else {
        galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);

@@ -955,6 +963,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
    return success;
}

+void ggml_gallocr_reserve_n_size(
+       ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+   GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+   for (int i = 0; i < galloc->n_buffers; i++) {
+       sizes[i] = 0;
+       for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+           sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+       }
+   }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+   return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

@@ -1173,7 +1196,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
    return true;
}

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+       struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);

@@ -1181,6 +1205,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;
+   *nbytes_total = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);

@@ -1192,10 +1217,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
-           if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+           if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
+           *nbytes_total += cur_buf_size;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;

@@ -1204,15 +1230,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

    // allocate remaining tensors
    if (cur_buf_size > 0) {
-       if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+       *nbytes_total += cur_buf_size;
+       if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

+   if (no_alloc) {
+       return NULL;
+   }
+
    if (n_buffers == 0) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
+       GGML_ASSERT(!buffers);
        return NULL;
    }

@@ -1222,10 +1254,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
-   free(buffers);
+   if (buffers) {
+       free(buffers); // can be NULL if context is empty or no_alloc
+   }
    return buffer;
}

+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+   size_t nbytes_total = 0;
+   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+   GGML_ASSERT(!buf);
+   return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+   size_t nbytes_total = 0;
+   return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
@@ -147,6 +147,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
        return (void *)ggml_backend_buffer_get_alignment(buffer);
    }

+   // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+   // I don't know whether the above comment is correct
+   if (!buffer->iface.get_base) {
+       return NULL;
+   }
+
    void * base = buffer->iface.get_base(buffer);

    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

@@ -1786,6 +1792,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    sched->is_alloc = false;
}

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+   GGML_ASSERT(sched);
+   GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+   GGML_ASSERT(sizes);
+
+   ggml_backend_sched_reset(sched);
+
+   ggml_backend_sched_synchronize(sched);
+
+   ggml_backend_sched_split_graph(sched, measure_graph);
+
+   ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
@@ -24,6 +24,7 @@

#define UNUSED GGML_UNUSED

+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                             int16x8_t * out_mins,
                                             int8_t * out_scales) {

@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
    scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
    memcpy(out_scales, scales_u32, 8);
}
+#endif

void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);

@@ -83,6 +83,11 @@ struct ggml_arm_arch_features_type {
} ggml_arm_arch_features = { 0 };
#endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+   int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif
+
#if defined(_WIN32)

@@ -189,6 +194,9 @@ typedef void * thread_ret_t;

typedef pthread_t ggml_thread_t;

+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
@@ -451,7 +459,7 @@ struct ggml_threadpool {
    struct ggml_cplan * cplan;

    // synchronization primitives
-   atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+   atomic_int n_graph;       // updated when there is work to be done (i.e. each graph); holds the graph count and the active thread count
    atomic_int GGML_CACHE_ALIGN n_barrier;
    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

@@ -459,12 +467,10 @@ struct ggml_threadpool {
    // these are atomic as an annotation for thread-sanitizer
    atomic_bool stop;         // Used for stopping the threadpool altogether
    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-   atomic_int abort;         // Used for aborting processing of a graph
+   atomic_int  abort;        // Used for aborting processing of a graph

    struct ggml_compute_state * workers; // per thread state
-   int n_threads_max;                   // number of threads in the pool
-   atomic_int n_threads_cur;            // number of threads used in the current graph
+   int n_threads;                       // Number of threads in the pool

    int32_t prio;  // Scheduling priority
    uint32_t poll; // Polling level (0 - no polling)

@@ -541,7 +547,7 @@ struct ggml_state {
static struct ggml_state g_state = {0};

void ggml_barrier(struct ggml_threadpool * tp) {
-   int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+   int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
    if (n_threads == 1) {
        return;
    }

@@ -558,7 +564,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
        // last thread
        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

-       // exit barrier (fill seq-cst fence)
+       // exit barrier (full seq-cst fence)
        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
        return;
    }
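For context, `ggml_barrier` is a counting barrier: each thread bumps `n_barrier`; the last arrival resets it and increments `n_barrier_passed`, which the waiting threads spin on. A simplified sketch with C++ atomics (the real code uses C11 `stdatomic` and reads the thread count out of the packed `n_graph`):

```cpp
// Simplified sketch of the counting barrier shown above; assumes all
// n_threads participants call barrier() with the same value.
#include <atomic>

static std::atomic<int> n_barrier{0};
static std::atomic<int> n_barrier_passed{0};

void barrier(int n_threads) {
    if (n_threads == 1) return;
    const int passed = n_barrier_passed.load(std::memory_order_relaxed);
    if (n_barrier.fetch_add(1, std::memory_order_seq_cst) == n_threads - 1) {
        n_barrier.store(0, std::memory_order_relaxed);            // last thread resets
        n_barrier_passed.fetch_add(1, std::memory_order_seq_cst); // releases the others
    } else {
        while (n_barrier_passed.load(std::memory_order_seq_cst) == passed) {
            // spin until the last arrival passes the barrier
        }
    }
}
```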
@@ -704,6 +710,15 @@ static void ggml_init_arm_arch_features(void) {}
#endif
#endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+   ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2630,7 +2645,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
    if (!threadpool) return;

-   const int n_threads = threadpool->n_threads_max;
+   const int n_threads = threadpool->n_threads;

#ifndef GGML_USE_OPENMP
    struct ggml_compute_state* workers = threadpool->workers;

@@ -2706,7 +2721,7 @@ struct ggml_cplan ggml_graph_plan(
        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
    }
    if (n_threads <= 0) {
-       n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+       n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
    }

#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2914,12 +2929,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
-       /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+       /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
    };

+   GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2945,6 +2962,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }

+   GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
    ggml_barrier(state->threadpool);

    return 0;

@@ -2952,27 +2971,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

#ifndef GGML_USE_OPENMP

-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-   struct ggml_threadpool * threadpool = state->threadpool;
-   int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-   return (state->ith < n_threads);
-}
-
// check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
    struct ggml_threadpool * threadpool = state->threadpool;

    if (state->pending || threadpool->stop || threadpool->pause) { return true; }

    // check for new graph/work
-   int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-   if (new_graph != state->last_graph) {
-       state->pending = ggml_graph_compute_thread_active(state);
-       state->last_graph = new_graph;
+   int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+   int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+   if (n_graph != state->last_graph) {
+       state->pending = (state->ith < n_threads);
+       state->last_graph = n_graph;
        return true;
    }

-   return state->pending;
+   return false;
}

// sync thread state after polling

@@ -2989,11 +3004,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
    struct ggml_threadpool * threadpool = state->threadpool;

-   // Skip polling for unused threads
-   if (!ggml_graph_compute_thread_active(state)) {
-       return state->pending;
-   }
-
    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
    // Perhaps, we can adjust it dynamically based on load and things.
    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;

@@ -3055,7 +3065,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
        ggml_graph_compute_check_for_work(state);
        if (state->pending) {
            state->pending = false;
-
            ggml_graph_compute_thread(state);
        }
    }
@@ -3070,14 +3079,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

    ggml_mutex_lock(&threadpool->mutex);

-   GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+   // Update the number of active threads and the graph count
+   int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+   n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

-   // Update the number of active threads
-   atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+   GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

    // Indicate the graph is ready to be processed
-   atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+   // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+   atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

    if (threadpool->pause) {
        // Update main thread prio and affinity to match the threadpool settings
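The kickoff above packs two values into the single `n_graph` atomic: the low 16 bits hold the active thread count and the high bits hold a monotonically increasing graph counter, so one seq-cst store publishes both at once. A sketch of the packing arithmetic:

```cpp
// Illustrative pack/unpack for the n_graph encoding introduced above.
#include <cstdint>

constexpr uint32_t N_THREADS_MASK = 0xffffU; // GGML_THREADPOOL_N_THREADS_MASK
constexpr uint32_t N_THREADS_BITS = 16;      // GGML_THREADPOOL_N_THREADS_BITS

constexpr uint32_t pack(uint32_t graph_id, uint32_t n_threads) {
    return (graph_id << N_THREADS_BITS) | (n_threads & N_THREADS_MASK);
}
constexpr uint32_t graph_of(uint32_t v)   { return v >> N_THREADS_BITS; }
constexpr uint32_t threads_of(uint32_t v) { return v & N_THREADS_MASK; }

static_assert(graph_of(pack(7, 12)) == 7 && threads_of(pack(7, 12)) == 12, "round-trip");
```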
@@ -3115,8 +3125,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
    threadpool->pause           = tpp->paused;
    threadpool->abort           = -1;
    threadpool->workers         = NULL;
-   threadpool->n_threads_max   = tpp->n_threads;
-   threadpool->n_threads_cur   = tpp->n_threads;
+   threadpool->n_threads       = tpp->n_threads;
    threadpool->poll            = tpp->poll;
    threadpool->prio            = tpp->prio;
    threadpool->ec              = GGML_STATUS_SUCCESS;

@@ -3211,7 +3220,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        {
            // update the number of threads from the actual number of threads that we got from OpenMP
            n_threads = omp_get_num_threads();
-           atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+           atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
        }

        // Apply thread CPU mask and priority

@@ -3224,13 +3233,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
            ggml_graph_compute_thread(&threadpool->workers[ith]);
        }
    } else {
-       atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+       atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
        ggml_graph_compute_thread(&threadpool->workers[0]);
    }
#else
-   if (n_threads > threadpool->n_threads_max) {
-       GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-       n_threads = threadpool->n_threads_max;
+   if (n_threads > threadpool->n_threads) {
+       GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+       n_threads = threadpool->n_threads;
    }

    // Kick all threads to start the new graph

@@ -3470,6 +3479,14 @@ int ggml_cpu_has_riscv_v(void) {
#endif
}

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+   return ggml_riscv_arch_features.rvv_vlen;
+#else
+   return 0;
+#endif
+}
+
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
    return 1;

@@ -3636,6 +3653,10 @@ void ggml_cpu_init(void) {
        ggml_init_arm_arch_features();
#endif

+#if defined(__riscv)
+       ggml_init_riscv_arch_features();
+#endif
+
        is_first_call = false;
    }
@@ -585,6 +585,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
+       if (ggml_cpu_get_rvv_vlen() > 0) {
+           static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+           features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+       }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }